3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.7-xen-sparse/drivers/xen/console/console.c
40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.7-xen-sparse/drivers/xen/evtchn/Makefile
40f56239DoibTX6R-ZYd3QTXAB8_TA linux-2.6.7-xen-sparse/drivers/xen/evtchn/evtchn.c
-40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
-40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
-405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/net/network.c
+410a9817HEVJvred5Oy_uKH3HFJC5Q linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
+4097ba831lpGeLlPg-bfV8XarVVuoQ linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
+4097ba83wvv8yi5P5xugCUBAdb6O-A linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
+4097ba83byY5bTSugJGZ1exTxIcMKw linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
+4087cf0dGmSbFhFZyIZBJzvqxY-qBw linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
+40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
+40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
+405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
4108f5c1ppFXVpQzCOAZ6xXYubsjKA linux-2.6.7-xen-sparse/drivers/xen/privcmd/Makefile
3e5a4e65IUfzzMu2kZFlGEB8-rpTaA linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c
40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/desc.h
40f5623cBiQhPHILVLrl3xa6bDBaRg linux-2.6.7-xen-sparse/include/asm-xen/xen.h
3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
+410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
401d7e160vaxMBAUSLSicuZ7AQjJ3w tools/examples/Makefile
ln -sf ../../../../${LINUX_26}/drivers/xen/privcmd/privcmd.c core.c
cd ${AD}/arch/xen/drivers/netif/frontend
-ln -sf ../../../../../${LINUX_26}/drivers/xen/net/network.c main.c
+ln -sf ../../../../../${LINUX_26}/drivers/xen/netfront/netfront.c main.c
#include <linux/config.h>
#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/hypervisor.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#endif
idx = 0;
wmb(); /* Make sure index is cleared first to avoid double updates. */
- if (unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0))
- panic("Failed to execute MMU updates");
+ if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0) )
+ {
+ printk(KERN_ALERT "Failed to execute MMU updates.\n");
+ BUG();
+ }
}
void _flush_page_update_queue(void)
increment_index();
spin_unlock_irqrestore(&update_lock, flags);
}
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+
+unsigned long allocate_empty_lowmem_region(unsigned long pages)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long *pfn_array;
+ unsigned long vstart;
+ unsigned long i;
+ int ret;
+ unsigned int order = get_order(pages*PAGE_SIZE);
+
+ vstart = __get_free_pages(GFP_KERNEL, order);
+ if ( vstart == 0 )
+ return 0UL;
+
+ pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+ if ( pfn_array == NULL )
+ BUG();
+
+ for ( i = 0; i < (1<<order); i++ )
+ {
+ pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE)));
+ pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
+ pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+ pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
+ queue_l1_entry_update(pte, 0);
+ phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef;
+ }
+
+ flush_page_update_queue();
+
+ ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
+ pfn_array, 1<<order);
+ if ( unlikely(ret != (1<<order)) )
+ {
+ printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
+ BUG();
+ }
+
+ vfree(pfn_array);
+
+ return vstart;
+}
+
+void deallocate_lowmem_region(unsigned long vstart, unsigned long pages)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long *pfn_array;
+ unsigned long i;
+ int ret;
+ unsigned int order = get_order(pages*PAGE_SIZE);
+
+ pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+ if ( pfn_array == NULL )
+ BUG();
+
+ ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+ pfn_array, 1<<order);
+ if ( unlikely(ret != (1<<order)) )
+ {
+ printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
+ ret);
+ BUG();
+ }
+
+ for ( i = 0; i < (1<<order); i++ )
+ {
+ pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE)));
+ pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
+ pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+ queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL);
+ queue_machphys_update(pfn_array[i], __pa(vstart)>>PAGE_SHIFT);
+ phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i];
+ }
+
+ flush_page_update_queue();
+
+ vfree(pfn_array);
+
+ free_pages(vstart, order);
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
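The two helpers added above give the backend a virtual address range whose
underlying machine frames have been handed back to Xen:
allocate_empty_lowmem_region() records each frame's MFN from its PTE, zaps the
kernel mapping, poisons the phys-to-machine entries and decreases the domain's
reservation, leaving empty slots into which foreign frames can later be mapped;
deallocate_lowmem_region() reverses the process. A minimal usage sketch with
hypothetical demo_* names (an editorial annotation, not part of the patch):

    static unsigned long demo_vstart;    /* hypothetical example state */

    static int __init demo_init(void)
    {
        /* Reserve 256 empty page slots for mapping foreign frames into. */
        demo_vstart = allocate_empty_lowmem_region(256);
        return (demo_vstart == 0) ? -ENOMEM : 0;
    }

    static void demo_exit(void)
    {
        /* Repopulate the region with fresh frames and release it. */
        deallocate_lowmem_region(demo_vstart, 256);
    }

netback_init() below makes the same allocate_empty_lowmem_region() call to
reserve MAX_PENDING_REQS page slots for its MMAP_VADDR() window.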
obj-y += block/
obj-y += console/
obj-y += evtchn/
-obj-y += net/
+obj-y += netfront/
obj-y += privcmd/
+
+obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += netback/
+++ /dev/null
-
-config XENNET
- tristate "Xen network driver"
- depends on NETDEVICES && ARCH_XEN
- help
- Network driver for Xen
+++ /dev/null
-
-obj-y := network.o
+++ /dev/null
-/******************************************************************************
- * Virtual network driver for conversing with remote driver backends.
- *
- * Copyright (c) 2002-2004, K A Fraser
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/version.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-#include <net/sock.h>
-#include <net/pkt_sched.h>
-
-#include <asm-xen/evtchn.h>
-#include <asm-xen/ctrl_if.h>
-
-#include <asm/page.h>
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-#include <asm-xen/netif.h>
-#else
-#include "../netif.h"
-#define irqreturn_t void
-#define IRQ_HANDLED
-#endif
-
-#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
-
-static void network_tx_buf_gc(struct net_device *dev);
-static void network_alloc_rx_buffers(struct net_device *dev);
-
-static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
-static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
-static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
-
-static struct list_head dev_list;
-
-struct net_private
-{
- struct list_head list;
- struct net_device *dev;
-
- struct net_device_stats stats;
- NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
- unsigned int tx_full;
-
- netif_tx_interface_t *tx;
- netif_rx_interface_t *rx;
-
- spinlock_t tx_lock;
- spinlock_t rx_lock;
-
- unsigned int handle;
- unsigned int evtchn;
- unsigned int irq;
-
- /* What is the status of our connection to the remote backend? */
-#define BEST_CLOSED 0
-#define BEST_DISCONNECTED 1
-#define BEST_CONNECTED 2
- unsigned int backend_state;
-
- /* Is this interface open or closed (down or up)? */
-#define UST_CLOSED 0
-#define UST_OPEN 1
- unsigned int user_state;
-
- /*
- * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
- * array is an index into a chain of free entries.
- */
- struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
- struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
-};
-
-/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
-#define ADD_ID_TO_FREELIST(_list, _id) \
- (_list)[(_id)] = (_list)[0]; \
- (_list)[0] = (void *)(unsigned long)(_id);
-#define GET_ID_FROM_FREELIST(_list) \
- ({ unsigned long _id = (unsigned long)(_list)[0]; \
- (_list)[0] = (_list)[_id]; \
- (unsigned short)_id; })
-
-static struct net_device *find_dev_by_handle(unsigned int handle)
-{
- struct list_head *ent;
- struct net_private *np;
- list_for_each ( ent, &dev_list )
- {
- np = list_entry(ent, struct net_private, list);
- if ( np->handle == handle )
- return np->dev;
- }
- return NULL;
-}
-
-/** Network interface info. */
-struct netif_ctrl {
- /** Number of interfaces. */
- int interface_n;
- /** Number of connected interfaces. */
- int connected_n;
- /** Error code. */
- int err;
-};
-
-static struct netif_ctrl netctrl;
-
-static void netctrl_init(void)
-{
- memset(&netctrl, 0, sizeof(netctrl));
- netctrl.interface_n = -1;
-}
-
-/** Get or set a network interface error.
- */
-static int netctrl_err(int err)
-{
- if(err < 0 && !netctrl.err){
- netctrl.err = err;
- printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
- }
- return netctrl.err;
-}
-
-/** Test if all network interfaces are connected.
- *
- * @return 1 if all connected, 0 if not, negative error code otherwise
- */
-static int netctrl_connected(void)
-{
- int ok = 0;
- ok = (netctrl.err ? netctrl.err :
- (netctrl.connected_n == netctrl.interface_n));
- return ok;
-}
-
-/** Count the connected network interfaces.
- *
- * @return connected count
- */
-static int netctrl_connected_count(void)
-{
-
- struct list_head *ent;
- struct net_private *np;
- unsigned int connected;
-
- connected = 0;
-
- list_for_each(ent, &dev_list)
- {
- np = list_entry(ent, struct net_private, list);
- if ( np->backend_state == BEST_CONNECTED )
- connected++;
- }
-
- netctrl.connected_n = connected;
- return connected;
-}
-
-static int network_open(struct net_device *dev)
-{
- struct net_private *np = dev->priv;
-
- memset(&np->stats, 0, sizeof(np->stats));
-
- np->user_state = UST_OPEN;
-
- network_alloc_rx_buffers(dev);
- np->rx->event = np->rx_resp_cons + 1;
-
- netif_start_queue(dev);
-
- return 0;
-}
-
-
-static void network_tx_buf_gc(struct net_device *dev)
-{
- NETIF_RING_IDX i, prod;
- unsigned short id;
- struct net_private *np = dev->priv;
- struct sk_buff *skb;
-
- if ( np->backend_state != BEST_CONNECTED )
- return;
-
- do {
- prod = np->tx->resp_prod;
-
- for ( i = np->tx_resp_cons; i != prod; i++ )
- {
- id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
- skb = np->tx_skbs[id];
- ADD_ID_TO_FREELIST(np->tx_skbs, id);
- dev_kfree_skb_any(skb);
- }
-
- np->tx_resp_cons = prod;
-
- /*
- * Set a new event, then check for race with update of tx_cons. Note
- * that it is essential to schedule a callback, no matter how few
- * buffers are pending. Even if there is space in the transmit ring,
- * higher layers may be blocked because too much data is outstanding:
- * in such cases notification from Xen is likely to be the only kick
- * that we'll get.
- */
- np->tx->event =
- prod + ((np->tx->req_prod - prod) >> 1) + 1;
- mb();
- }
- while ( prod != np->tx->resp_prod );
-
- if ( np->tx_full &&
- ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
- {
- np->tx_full = 0;
- if ( np->user_state == UST_OPEN )
- netif_wake_queue(dev);
- }
-}
-
-
-static void network_alloc_rx_buffers(struct net_device *dev)
-{
- unsigned short id;
- struct net_private *np = dev->priv;
- struct sk_buff *skb;
- NETIF_RING_IDX i = np->rx->req_prod;
- int nr_pfns = 0;
-
- /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
- if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) ||
- unlikely(np->backend_state != BEST_CONNECTED) )
- return;
-
- do {
- skb = dev_alloc_skb(RX_BUF_SIZE);
- if ( unlikely(skb == NULL) )
- break;
-
- skb->dev = dev;
-
- if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
- panic("alloc_skb needs to provide us page-aligned buffers.");
-
- id = GET_ID_FROM_FREELIST(np->rx_skbs);
-
- np->rx_skbs[id] = skb;
-
- np->rx->ring[MASK_NETIF_RX_IDX(i)].req.id = id;
-
- rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
-
- rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
- rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
- rx_mcl[nr_pfns].args[1] = 0;
- rx_mcl[nr_pfns].args[2] = 0;
-
- nr_pfns++;
- }
- while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
-
- if ( unlikely(nr_pfns == 0) )
- return;
-
- /*
- * We may have allocated buffers which have entries outstanding in the page
- * update queue -- make sure we flush those first!
- */
- flush_page_update_queue();
-
- /* After all PTEs have been zapped we blow away stale TLB entries. */
- rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
-
- /* Give away a batch of pages. */
- rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
- rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
- rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
- rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
-
- /* Zap PTEs and give away pages in one big multicall. */
- (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
-
- /* Check return status of HYPERVISOR_dom_mem_op(). */
- if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
- panic("Unable to reduce memory reservation\n");
-
- np->rx->req_prod = i;
-}
-
-
-static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- unsigned short id;
- struct net_private *np = (struct net_private *)dev->priv;
- netif_tx_request_t *tx;
- NETIF_RING_IDX i;
-
- if ( unlikely(np->tx_full) )
- {
- printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
- netif_stop_queue(dev);
- return -ENOBUFS;
- }
-
- if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
- PAGE_SIZE) )
- {
- struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
- if ( unlikely(new_skb == NULL) )
- return 1;
- skb_put(new_skb, skb->len);
- memcpy(new_skb->data, skb->data, skb->len);
- dev_kfree_skb(skb);
- skb = new_skb;
- }
-
- spin_lock_irq(&np->tx_lock);
-
- if ( np->backend_state != BEST_CONNECTED )
- {
- spin_unlock_irq(&np->tx_lock);
- return 1;
- }
-
- i = np->tx->req_prod;
-
- id = GET_ID_FROM_FREELIST(np->tx_skbs);
- np->tx_skbs[id] = skb;
-
- tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
-
- tx->id = id;
- tx->addr = virt_to_machine(skb->data);
- tx->size = skb->len;
-
- wmb();
- np->tx->req_prod = i + 1;
-
- network_tx_buf_gc(dev);
-
- if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
- {
- np->tx_full = 1;
- netif_stop_queue(dev);
- }
-
- spin_unlock_irq(&np->tx_lock);
-
- np->stats.tx_bytes += skb->len;
- np->stats.tx_packets++;
-
- /* Only notify Xen if there are no outstanding responses. */
- mb();
- if ( np->tx->resp_prod == i )
- notify_via_evtchn(np->evtchn);
-
- return 0;
-}
-
-
-static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
- struct net_device *dev = dev_id;
- struct net_private *np = dev->priv;
- unsigned long flags;
-
- spin_lock_irqsave(&np->tx_lock, flags);
- network_tx_buf_gc(dev);
- spin_unlock_irqrestore(&np->tx_lock, flags);
-
- if ( (np->rx_resp_cons != np->rx->resp_prod) &&
- (np->user_state == UST_OPEN) )
- netif_rx_schedule(dev);
-
- return IRQ_HANDLED;
-}
-
-
-static int netif_poll(struct net_device *dev, int *pbudget)
-{
- struct net_private *np = dev->priv;
- struct sk_buff *skb;
- netif_rx_response_t *rx;
- NETIF_RING_IDX i;
- mmu_update_t *mmu = rx_mmu;
- multicall_entry_t *mcl = rx_mcl;
- int work_done, budget, more_to_do = 1;
- struct sk_buff_head rxq;
- unsigned long flags;
-
- spin_lock(&np->rx_lock);
-
- if ( np->backend_state != BEST_CONNECTED )
- {
- spin_unlock(&np->rx_lock);
- return 0;
- }
-
- skb_queue_head_init(&rxq);
-
- if ( (budget = *pbudget) > dev->quota )
- budget = dev->quota;
-
- for ( i = np->rx_resp_cons, work_done = 0;
- (i != np->rx->resp_prod) && (work_done < budget);
- i++, work_done++ )
- {
- rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
-
- /*
- * An error here is very odd. Usually indicates a backend bug,
- * low-memory condition, or that we didn't have reservation headroom.
- * Whatever - print an error and queue the id again straight away.
- */
- if ( unlikely(rx->status <= 0) )
- {
- /* Gate this error. We get a (valid) slew of them on suspend. */
- if ( np->user_state == UST_OPEN )
- printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
- np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
- wmb();
- np->rx->req_prod++;
- continue;
- }
-
- skb = np->rx_skbs[rx->id];
- ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
-
- skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
- skb_put(skb, rx->status);
-
- np->stats.rx_packets++;
- np->stats.rx_bytes += rx->status;
-
- /* Remap the page. */
- mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
- mmu->val = __pa(skb->head) >> PAGE_SHIFT;
- mmu++;
- mcl->op = __HYPERVISOR_update_va_mapping;
- mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
- mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
- mcl->args[2] = 0;
- mcl++;
-
- phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
- rx->addr >> PAGE_SHIFT;
-
- __skb_queue_tail(&rxq, skb);
- }
-
- /* Do all the remapping work, and M->P updates, in one big hypercall. */
- if ( likely((mcl - rx_mcl) != 0) )
- {
- mcl->op = __HYPERVISOR_mmu_update;
- mcl->args[0] = (unsigned long)rx_mmu;
- mcl->args[1] = mmu - rx_mmu;
- mcl->args[2] = 0;
- mcl++;
- (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
- }
-
- while ( (skb = __skb_dequeue(&rxq)) != NULL )
- {
- /* Set the shared-info area, which is hidden behind the real data. */
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->frag_list = NULL;
-
- /* Ethernet-specific work. Delayed to here as it peeks the header. */
- skb->protocol = eth_type_trans(skb, dev);
-
- /* Pass it up. */
- netif_rx(skb);
- dev->last_rx = jiffies;
- }
-
- np->rx_resp_cons = i;
-
- network_alloc_rx_buffers(dev);
-
- *pbudget -= work_done;
- dev->quota -= work_done;
-
- if ( work_done < budget )
- {
- local_irq_save(flags);
-
- np->rx->event = i + 1;
-
- /* Deal with hypervisor racing our resetting of rx_event. */
- mb();
- if ( np->rx->resp_prod == i )
- {
- __netif_rx_complete(dev);
- more_to_do = 0;
- }
-
- local_irq_restore(flags);
- }
-
- spin_unlock(&np->rx_lock);
-
- return more_to_do;
-}
-
-
-static int network_close(struct net_device *dev)
-{
- struct net_private *np = dev->priv;
- np->user_state = UST_CLOSED;
- netif_stop_queue(np->dev);
- return 0;
-}
-
-
-static struct net_device_stats *network_get_stats(struct net_device *dev)
-{
- struct net_private *np = (struct net_private *)dev->priv;
- return &np->stats;
-}
-
-
-static void network_connect(struct net_device *dev,
- netif_fe_interface_status_changed_t *status)
-{
- struct net_private *np;
- int i, requeue_idx;
- netif_tx_request_t *tx;
-
- np = dev->priv;
- spin_lock_irq(&np->rx_lock);
- spin_lock(&np->tx_lock);
-
- /* Recovery procedure: */
-
- /* Step 1: Reinitialise variables. */
- np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
- np->rx->event = 1;
-
- /* Step 2: Rebuild the RX and TX ring contents.
- * NB. We could just free the queued TX packets now but we hope
- * that sending them out might do some good. We have to rebuild
- * the RX ring because some of our pages are currently flipped out
- * so we can't just free the RX skbs.
- * NB2. Freelist index entries are always going to be less than
- * __PAGE_OFFSET, whereas pointers to skbs will always be equal or
- * greater than __PAGE_OFFSET: we use this property to distinguish
- * them.
- */
-
- /* Rebuild the TX buffer freelist and the TX ring itself.
- * NB. This reorders packets. We could keep more private state
- * to avoid this but maybe it doesn't matter so much given the
- * interface has been down.
- */
- for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
- {
- if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
- {
- struct sk_buff *skb = np->tx_skbs[i];
-
- tx = &np->tx->ring[requeue_idx++].req;
-
- tx->id = i;
- tx->addr = virt_to_machine(skb->data);
- tx->size = skb->len;
-
- np->stats.tx_bytes += skb->len;
- np->stats.tx_packets++;
- }
- }
- wmb();
- np->tx->req_prod = requeue_idx;
-
- /* Rebuild the RX buffer freelist and the RX ring itself. */
- for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
- if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
- np->rx->ring[requeue_idx++].req.id = i;
- wmb();
- np->rx->req_prod = requeue_idx;
-
- /* Step 3: All public and private state should now be sane. Get
- * ready to start sending and receiving packets and give the driver
- * domain a kick because we've probably just requeued some
- * packets.
- */
- np->backend_state = BEST_CONNECTED;
- notify_via_evtchn(status->evtchn);
- network_tx_buf_gc(dev);
-
- if ( np->user_state == UST_OPEN )
- netif_start_queue(dev);
-
- spin_unlock(&np->tx_lock);
- spin_unlock_irq(&np->rx_lock);
-}
-
-static void netif_status_change(netif_fe_interface_status_changed_t *status)
-{
- ctrl_msg_t cmsg;
- netif_fe_interface_connect_t up;
- struct net_device *dev;
- struct net_private *np;
-
- if ( netctrl.interface_n <= 0 )
- {
- printk(KERN_WARNING "Status change: no interfaces\n");
- return;
- }
-
- dev = find_dev_by_handle(status->handle);
- if(!dev){
- printk(KERN_WARNING "Status change: invalid netif handle %u\n",
- status->handle);
- return;
- }
- np = dev->priv;
-
- switch ( status->status )
- {
- case NETIF_INTERFACE_STATUS_DESTROYED:
- printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
- np->backend_state);
- break;
-
- case NETIF_INTERFACE_STATUS_DISCONNECTED:
- if ( np->backend_state != BEST_CLOSED )
- {
- printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
- " in state %d\n", np->backend_state);
- printk(KERN_INFO "Attempting to reconnect network interface\n");
-
- /* Begin interface recovery.
- *
- * NB. Whilst we're recovering, we turn the carrier state off. We
- * take measures to ensure that this device isn't used for
- * anything. We also stop the queue for this device. Various
- * different approaches (e.g. continuing to buffer packets) have
- * been tested but don't appear to improve the overall impact on
- * TCP connections.
- *
- * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
- * is initiated by a special "RESET" message - disconnect could
- * just mean we're not allowed to use this interface any more.
- */
-
- /* Stop old i/f to prevent errors whilst we rebuild the state. */
- spin_lock_irq(&np->tx_lock);
- spin_lock(&np->rx_lock);
- netif_stop_queue(dev);
- np->backend_state = BEST_DISCONNECTED;
- spin_unlock(&np->rx_lock);
- spin_unlock_irq(&np->tx_lock);
-
- /* Free resources. */
- free_irq(np->irq, dev);
- unbind_evtchn_from_irq(np->evtchn);
- free_page((unsigned long)np->tx);
- free_page((unsigned long)np->rx);
- }
-
- /* Move from CLOSED to DISCONNECTED state. */
- np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
- np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
- memset(np->tx, 0, PAGE_SIZE);
- memset(np->rx, 0, PAGE_SIZE);
- np->backend_state = BEST_DISCONNECTED;
-
- /* Construct an interface-CONNECT message for the domain controller. */
- cmsg.type = CMSG_NETIF_FE;
- cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT;
- cmsg.length = sizeof(netif_fe_interface_connect_t);
- up.handle = status->handle;
- up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
- up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
- memcpy(cmsg.msg, &up, sizeof(up));
-
- /* Tell the controller to bring up the interface. */
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
- break;
-
- case NETIF_INTERFACE_STATUS_CONNECTED:
- if ( np->backend_state == BEST_CLOSED )
- {
- printk(KERN_WARNING "Unexpected netif-CONNECTED message"
- " in state %d\n", np->backend_state);
- break;
- }
-
- memcpy(dev->dev_addr, status->mac, ETH_ALEN);
-
- network_connect(dev, status);
-
- np->evtchn = status->evtchn;
- np->irq = bind_evtchn_to_irq(np->evtchn);
- (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM,
- dev->name, dev);
-
- netctrl_connected_count();
- break;
-
- default:
- printk(KERN_WARNING "Status change to unknown value %d\n",
- status->status);
- break;
- }
-}
-
-/** Create a network device.
- * @param handle device handle
- * @param val return parameter for created device
- * @return 0 on success, error code otherwise
- */
-static int create_netdev(int handle, struct net_device **val)
-{
- int i, err = 0;
- struct net_device *dev = NULL;
- struct net_private *np = NULL;
-
- if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
- {
- printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
- err = -ENOMEM;
- goto exit;
- }
-
- np = dev->priv;
- np->backend_state = BEST_CLOSED;
- np->user_state = UST_CLOSED;
- np->handle = handle;
-
- spin_lock_init(&np->tx_lock);
- spin_lock_init(&np->rx_lock);
-
- /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
- for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
- np->tx_skbs[i] = (void *)(i+1);
- for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
- np->rx_skbs[i] = (void *)(i+1);
-
- dev->open = network_open;
- dev->hard_start_xmit = network_start_xmit;
- dev->stop = network_close;
- dev->get_stats = network_get_stats;
- dev->poll = netif_poll;
- dev->weight = 64;
-
- if ( (err = register_netdev(dev)) != 0 )
- {
- printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
- goto exit;
- }
- np->dev = dev;
- list_add(&np->list, &dev_list);
-
- exit:
- if ( (err != 0) && (dev != NULL ) )
- kfree(dev);
- else if ( val != NULL )
- *val = dev;
- return err;
-}
-
-/*
- * Initialize the network control interface. Set the number of network devices
- * and create them.
- */
-static void netif_driver_status_change(
- netif_fe_driver_status_changed_t *status)
-{
- int err = 0;
- int i;
-
- netctrl.interface_n = status->nr_interfaces;
- netctrl.connected_n = 0;
-
- for ( i = 0; i < netctrl.interface_n; i++ )
- {
- if ( (err = create_netdev(i, NULL)) != 0 )
- {
- netctrl_err(err);
- break;
- }
- }
-}
-
-static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
- int respond = 1;
-
- switch ( msg->subtype )
- {
- case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
- if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
- goto error;
- netif_status_change((netif_fe_interface_status_changed_t *)
- &msg->msg[0]);
- break;
-
- case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
- if ( msg->length != sizeof(netif_fe_driver_status_changed_t) )
- goto error;
- netif_driver_status_change((netif_fe_driver_status_changed_t *)
- &msg->msg[0]);
- /* Message is a response */
- respond = 0;
- break;
-
- error:
- default:
- msg->length = 0;
- break;
- }
-
- if ( respond )
- ctrl_if_send_response(msg);
-}
-
-
-static int __init netif_init(void)
-{
- ctrl_msg_t cmsg;
- netif_fe_driver_status_changed_t st;
- int err = 0, wait_i, wait_n = 20;
-
- if ( (start_info.flags & SIF_INITDOMAIN) ||
- (start_info.flags & SIF_NET_BE_DOMAIN) )
- return 0;
-
- printk("Initialising Xen virtual ethernet frontend driver");
-
- INIT_LIST_HEAD(&dev_list);
-
- netctrl_init();
-
- (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
-
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_NETIF_FE;
- cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
- cmsg.length = sizeof(netif_fe_driver_status_changed_t);
- st.status = NETIF_DRIVER_STATUS_UP;
- st.nr_interfaces = 0;
- memcpy(cmsg.msg, &st, sizeof(st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
- /* Wait for all interfaces to be connected. */
- for ( wait_i = 0; ; wait_i++)
- {
- if ( (err = (wait_i < wait_n) ? netctrl_connected() : -ENETDOWN) != 0 )
- {
- err = (err > 0) ? 0 : err;
- break;
- }
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
- }
-
- if ( err )
- ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
-
- return err;
-}
-
-__initcall(netif_init);
--- /dev/null
+
+obj-y := netback.o control.o interface.o
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/common.h
+ */
+
+#ifndef __NETIF__BACKEND__COMMON_H__
+#define __NETIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/netif.h>
+
+#if 0
+#define ASSERT(_p) \
+ if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct netif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+
+ /* Physical parameters of the comms window. */
+ unsigned long tx_shmem_frame;
+ unsigned long rx_shmem_frame;
+ unsigned int evtchn;
+ int irq;
+
+ /* The shared rings and indexes. */
+ netif_tx_interface_t *tx;
+ netif_rx_interface_t *rx;
+
+ /* Private indexes into shared ring. */
+ NETIF_RING_IDX rx_req_cons;
+ NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */
+ NETIF_RING_IDX tx_req_cons;
+ NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */
+
+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+ unsigned long credit_bytes;
+ unsigned long credit_usec;
+ unsigned long remaining_credit;
+ struct timer_list credit_timeout;
+
+ /* Miscellaneous private stuff. */
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+ /*
+ * DISCONNECT response is deferred until pending requests are ack'ed.
+ * We therefore need to store the id from the original request.
+ */
+ u8 disconnect_rspid;
+ struct netif_st *hash_next;
+ struct list_head list; /* scheduling list */
+ atomic_t refcnt;
+ spinlock_t rx_lock, tx_lock;
+ struct net_device *dev;
+ struct net_device_stats stats;
+} netif_t;
+
+void netif_create(netif_be_create_t *create);
+void netif_destroy(netif_be_destroy_t *destroy);
+void netif_connect(netif_be_connect_t *connect);
+int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id);
+void __netif_disconnect_complete(netif_t *netif);
+netif_t *netif_find_by_handle(domid_t domid, unsigned int handle);
+#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define netif_put(_b) \
+ do { \
+ if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+ __netif_disconnect_complete(_b); \
+ } while (0)
+
+void netif_interface_init(void);
+void netif_ctrlif_init(void);
+
+void netif_deschedule(netif_t *netif);
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __NETIF__BACKEND__COMMON_H__ */
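The netif_get()/netif_put() pair above carries the deferred-disconnect
protocol: any code path that may outlive a control-plane DISCONNECT request
holds a reference, and it is the final netif_put() that invokes
__netif_disconnect_complete() to send the postponed response. An illustrative
sketch with a hypothetical helper name (not part of the patch):

    /* Deferred use of a netif is bracketed by a get/put pair. */
    static void demo_defer_work(netif_t *netif)
    {
        netif_get(netif);       /* pin the interface while work is queued  */
        /* ... place it on a tasklet or scheduling list, etc. ...          */
        netif_put(netif);       /* dropping the last reference after a     */
    }                           /* DISCONNECT completes the disconnect     */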
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/control.c
+ *
+ * Routines for interfacing with the control plane.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+ switch ( msg->subtype )
+ {
+ case CMSG_NETIF_BE_CREATE:
+ if ( msg->length != sizeof(netif_be_create_t) )
+ goto parse_error;
+ netif_create((netif_be_create_t *)&msg->msg[0]);
+ break;
+ case CMSG_NETIF_BE_DESTROY:
+ if ( msg->length != sizeof(netif_be_destroy_t) )
+ goto parse_error;
+ netif_destroy((netif_be_destroy_t *)&msg->msg[0]);
+ break;
+ case CMSG_NETIF_BE_CONNECT:
+ if ( msg->length != sizeof(netif_be_connect_t) )
+ goto parse_error;
+ netif_connect((netif_be_connect_t *)&msg->msg[0]);
+ break;
+ case CMSG_NETIF_BE_DISCONNECT:
+ if ( msg->length != sizeof(netif_be_disconnect_t) )
+ goto parse_error;
+ if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) )
+ return; /* Sending the response is deferred until later. */
+ break;
+ default:
+ goto parse_error;
+ }
+
+ ctrl_if_send_response(msg);
+ return;
+
+ parse_error:
+ DPRINTK("Parse error while reading message subtype %d, len %d\n",
+ msg->subtype, msg->length);
+ msg->length = 0;
+ ctrl_if_send_response(msg);
+}
+
+void netif_ctrlif_init(void)
+{
+ ctrl_msg_t cmsg;
+ netif_be_driver_status_changed_t st;
+
+ (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
+
+ /* Send a driver-UP notification to the domain controller. */
+ cmsg.type = CMSG_NETIF_BE;
+ cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED;
+ cmsg.length = sizeof(netif_be_driver_status_changed_t);
+ st.status = NETIF_DRIVER_STATUS_UP;
+ memcpy(cmsg.msg, &st, sizeof(st));
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/interface.c
+ *
+ * Network-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <linux/rtnetlink.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define VMALLOC_VMADDR(x) ((unsigned long)(x))
+#endif
+
+#define NETIF_HASHSZ 1024
+#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1))
+
+static netif_t *netif_hash[NETIF_HASHSZ];
+
+netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
+{
+ netif_t *netif = netif_hash[NETIF_HASH(domid, handle)];
+ while ( (netif != NULL) &&
+ ((netif->domid != domid) || (netif->handle != handle)) )
+ netif = netif->hash_next;
+ return netif;
+}
+
+void __netif_disconnect_complete(netif_t *netif)
+{
+ ctrl_msg_t cmsg;
+ netif_be_disconnect_t disc;
+
+ /*
+     * These can't be done in netif_disconnect() because at that point there
+     * may be outstanding requests on the ring whose asynchronous responses
+     * must still be notified to the remote driver.
+ */
+ unbind_evtchn_from_irq(netif->evtchn);
+ vfree(netif->tx); /* Frees netif->rx as well. */
+ rtnl_lock();
+ (void)dev_close(netif->dev);
+ rtnl_unlock();
+
+ /* Construct the deferred response message. */
+ cmsg.type = CMSG_NETIF_BE;
+ cmsg.subtype = CMSG_NETIF_BE_DISCONNECT;
+ cmsg.id = netif->disconnect_rspid;
+ cmsg.length = sizeof(netif_be_disconnect_t);
+ disc.domid = netif->domid;
+ disc.netif_handle = netif->handle;
+ disc.status = NETIF_BE_STATUS_OKAY;
+ memcpy(cmsg.msg, &disc, sizeof(disc));
+
+ /*
+ * Make sure message is constructed /before/ status change, because
+ * after the status change the 'netif' structure could be deallocated at
+ * any time. Also make sure we send the response /after/ status change,
+ * as otherwise a subsequent CONNECT request could spuriously fail if
+ * another CPU doesn't see the status change yet.
+ */
+ mb();
+ if ( netif->status != DISCONNECTING )
+ BUG();
+ netif->status = DISCONNECTED;
+ mb();
+
+ /* Send the successful response. */
+ ctrl_if_send_response(&cmsg);
+}
+
+void netif_create(netif_be_create_t *create)
+{
+ int err = 0;
+ domid_t domid = create->domid;
+ unsigned int handle = create->netif_handle;
+ struct net_device *dev;
+ netif_t **pnetif, *netif;
+ char name[IFNAMSIZ] = {};
+
+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+ dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
+ if ( dev == NULL )
+ {
+ DPRINTK("Could not create netif: out of memory\n");
+ create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+ return;
+ }
+
+ netif = dev->priv;
+ memset(netif, 0, sizeof(*netif));
+ netif->domid = domid;
+ netif->handle = handle;
+ netif->status = DISCONNECTED;
+ spin_lock_init(&netif->rx_lock);
+ spin_lock_init(&netif->tx_lock);
+ atomic_set(&netif->refcnt, 0);
+ netif->dev = dev;
+
+ netif->credit_bytes = netif->remaining_credit = ~0UL;
+ netif->credit_usec = 0UL;
+    /*init_ac_timer(&netif->credit_timeout);*/
+
+ pnetif = &netif_hash[NETIF_HASH(domid, handle)];
+ while ( *pnetif != NULL )
+ {
+ if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) )
+ {
+ DPRINTK("Could not create netif: already exists\n");
+ create->status = NETIF_BE_STATUS_INTERFACE_EXISTS;
+ kfree(dev);
+ return;
+ }
+ pnetif = &(*pnetif)->hash_next;
+ }
+
+ dev->hard_start_xmit = netif_be_start_xmit;
+ dev->get_stats = netif_be_get_stats;
+ memcpy(dev->dev_addr, create->mac, ETH_ALEN);
+
+ /* Disable queuing. */
+ dev->tx_queue_len = 0;
+
+ /* Force a different MAC from remote end. */
+ dev->dev_addr[2] ^= 1;
+
+ if ( (err = register_netdev(dev)) != 0 )
+ {
+ DPRINTK("Could not register new net device %s: err=%d\n",
+ dev->name, err);
+ create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+ kfree(dev);
+ return;
+ }
+
+ netif->hash_next = *pnetif;
+ *pnetif = netif;
+
+ DPRINTK("Successfully created netif\n");
+ create->status = NETIF_BE_STATUS_OKAY;
+}
+
+void netif_destroy(netif_be_destroy_t *destroy)
+{
+ domid_t domid = destroy->domid;
+ unsigned int handle = destroy->netif_handle;
+ netif_t **pnetif, *netif;
+
+ pnetif = &netif_hash[NETIF_HASH(domid, handle)];
+ while ( (netif = *pnetif) != NULL )
+ {
+ if ( (netif->domid == domid) && (netif->handle == handle) )
+ {
+ if ( netif->status != DISCONNECTED )
+ goto still_connected;
+ goto destroy;
+ }
+ pnetif = &netif->hash_next;
+ }
+
+ destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+
+ still_connected:
+ destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
+ return;
+
+ destroy:
+ *pnetif = netif->hash_next;
+ unregister_netdev(netif->dev);
+ kfree(netif->dev);
+ destroy->status = NETIF_BE_STATUS_OKAY;
+}
+
+void netif_connect(netif_be_connect_t *connect)
+{
+ domid_t domid = connect->domid;
+ unsigned int handle = connect->netif_handle;
+ unsigned int evtchn = connect->evtchn;
+ unsigned long tx_shmem_frame = connect->tx_shmem_frame;
+ unsigned long rx_shmem_frame = connect->rx_shmem_frame;
+ struct vm_struct *vma;
+ pgprot_t prot;
+ int error;
+ netif_t *netif;
+#if 0
+ struct net_device *eth0_dev;
+#endif
+
+ netif = netif_find_by_handle(domid, handle);
+ if ( unlikely(netif == NULL) )
+ {
+ DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n",
+ connect->domid, connect->netif_handle);
+ connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+ }
+
+ if ( netif->status != DISCONNECTED )
+ {
+ connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
+ return;
+ }
+
+ if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL )
+ {
+ connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+ return;
+ }
+
+ prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+ error = direct_remap_area_pages(&init_mm,
+ VMALLOC_VMADDR(vma->addr),
+ tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+ prot, domid);
+ error |= direct_remap_area_pages(&init_mm,
+ VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
+ rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+ prot, domid);
+ if ( error != 0 )
+ {
+ if ( error == -ENOMEM )
+ connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+ else if ( error == -EFAULT )
+ connect->status = NETIF_BE_STATUS_MAPPING_ERROR;
+ else
+ connect->status = NETIF_BE_STATUS_ERROR;
+ vfree(vma->addr);
+ return;
+ }
+
+ netif->evtchn = evtchn;
+ netif->irq = bind_evtchn_to_irq(evtchn);
+ netif->tx_shmem_frame = tx_shmem_frame;
+ netif->rx_shmem_frame = rx_shmem_frame;
+ netif->tx =
+ (netif_tx_interface_t *)vma->addr;
+ netif->rx =
+ (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
+ netif->status = CONNECTED;
+ netif_get(netif);
+
+ rtnl_lock();
+ (void)dev_open(netif->dev);
+ rtnl_unlock();
+
+ (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
+ netif_start_queue(netif->dev);
+
+ connect->status = NETIF_BE_STATUS_OKAY;
+}
+
+int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+ domid_t domid = disconnect->domid;
+ unsigned int handle = disconnect->netif_handle;
+ netif_t *netif;
+
+ netif = netif_find_by_handle(domid, handle);
+ if ( unlikely(netif == NULL) )
+ {
+ DPRINTK("netif_disconnect attempted for non-existent netif"
+ " (%u,%u)\n", disconnect->domid, disconnect->netif_handle);
+ disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return 1; /* Caller will send response error message. */
+ }
+
+ if ( netif->status == CONNECTED )
+ {
+ netif->status = DISCONNECTING;
+ netif->disconnect_rspid = rsp_id;
+ wmb(); /* Let other CPUs see the status change. */
+ netif_stop_queue(netif->dev);
+ free_irq(netif->irq, netif);
+ netif_deschedule(netif);
+ netif_put(netif);
+ return 0; /* Caller should not send response message. */
+ }
+
+ disconnect->status = NETIF_BE_STATUS_OKAY;
+ return 1;
+}
+
+void netif_interface_init(void)
+{
+ memset(netif_hash, 0, sizeof(netif_hash));
+}
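Together with control.c above, this file implements a lifecycle driven
entirely by control-plane messages; a summary of the transitions, derived from
the dispatch in netif_ctrlif_rx() (descriptive annotation only):

    /*
     * CMSG_NETIF_BE_CREATE     -> netif_create()     status = DISCONNECTED
     * CMSG_NETIF_BE_CONNECT    -> netif_connect()    rings mapped; CONNECTED
     * CMSG_NETIF_BE_DISCONNECT -> netif_disconnect() DISCONNECTING; the
     *                             response is deferred until the last
     *                             netif_put() runs __netif_disconnect_complete()
     * CMSG_NETIF_BE_DESTROY    -> netif_destroy()    interface unregistered/freed
     */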
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/main.c
+ *
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/netif/frontend
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ */
+
+#include "common.h"
+
+static void netif_page_release(struct page *page);
+static void make_tx_response(netif_t *netif,
+ u16 id,
+ s8 st);
+static int make_rx_response(netif_t *netif,
+ u16 id,
+ s8 st,
+ memory_t addr,
+ u16 size);
+
+static void net_tx_action(unsigned long unused);
+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
+
+static void net_rx_action(unsigned long unused);
+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+
+static struct sk_buff_head rx_queue;
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3];
+static unsigned char rx_notify[NR_EVENT_CHANNELS];
+
+/* Don't currently gate addition of an interface to the tx scheduling list. */
+#define tx_work_exists(_if) (1)
+
+#define MAX_PENDING_REQS 256
+static unsigned long mmap_vstart;
+#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
+
+#define PKT_PROT_LEN (ETH_HLEN + 20)
+
+static struct {
+ netif_tx_request_t req;
+ netif_t *netif;
+} pending_tx_info[MAX_PENDING_REQS];
+static u16 pending_ring[MAX_PENDING_REQS];
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
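The pending-ring counters above are free-running: pending_prod counts slots
ever returned to the free pool, pending_cons counts slots ever taken from it,
and NR_PENDING_REQS is their difference, i.e. the number of guest TX requests
currently mapped. A worked example (editorial annotation only):

    /*
     * Initially pending_prod = MAX_PENDING_REQS = 256 and pending_cons = 0:
     *     NR_PENDING_REQS = 256 - 256 + 0 = 0      (nothing in flight)
     * After taking three slots (pending_cons = 3) and returning one of them
     * (pending_prod = 257):
     *     NR_PENDING_REQS = 256 - 257 + 3 = 2      (two requests still mapped)
     * Either counter is reduced modulo MAX_PENDING_REQS by MASK_PEND_IDX()
     * whenever it is used to index pending_ring[].
     */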
+
+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
+static u16 dealloc_ring[MAX_PENDING_REQS];
+static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED;
+static PEND_RING_IDX dealloc_prod, dealloc_cons;
+
+static struct sk_buff_head tx_queue;
+static multicall_entry_t tx_mcl[MAX_PENDING_REQS];
+
+static struct list_head net_schedule_list;
+static spinlock_t net_schedule_list_lock;
+
+#define MAX_MFN_ALLOC 64
+static unsigned long mfn_list[MAX_MFN_ALLOC];
+static unsigned int alloc_index = 0;
+static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
+
+static void __refresh_mfn_list(void)
+{
+ int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+ mfn_list, MAX_MFN_ALLOC);
+ if ( unlikely(ret != MAX_MFN_ALLOC) )
+ BUG();
+ alloc_index = MAX_MFN_ALLOC;
+}
+
+static unsigned long get_new_mfn(void)
+{
+ unsigned long mfn, flags;
+ spin_lock_irqsave(&mfn_lock, flags);
+ if ( alloc_index == 0 )
+ __refresh_mfn_list();
+ mfn = mfn_list[--alloc_index];
+ spin_unlock_irqrestore(&mfn_lock, flags);
+ return mfn;
+}
+
+static void dealloc_mfn(unsigned long mfn)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&mfn_lock, flags);
+ if ( alloc_index != MAX_MFN_ALLOC )
+ mfn_list[alloc_index++] = mfn;
+ else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1) != 1 )
+ BUG();
+ spin_unlock_irqrestore(&mfn_lock, flags);
+}
+
+static inline void maybe_schedule_tx_action(void)
+{
+ smp_mb();
+ if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&net_schedule_list) )
+ tasklet_schedule(&net_tx_tasklet);
+}
+
+/*
+ * This is the primary RECEIVE function for a network interface.
+ * Note that, from the p.o.v. of /this/ OS it looks like a transmit.
+ */
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ netif_t *netif = (netif_t *)dev->priv;
+
+ ASSERT(skb->dev == dev);
+
+ /* Drop the packet if the target domain has no receive buffers. */
+ if ( (netif->rx_req_cons == netif->rx->req_prod) ||
+ ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
+ goto drop;
+
+ /*
+ * We do not copy the packet unless:
+ * 1. The data is shared; or
+ * 2. It spans a page boundary; or
+ * 3. We cannot be sure the whole data page is allocated.
+ * The copying method is taken from skb_copy().
+     * NB. We also can't cope with fragmented packets, but we won't get
+     * any because we do not advertise the NETIF_F_SG feature.
+ */
+ if ( skb_shared(skb) || skb_cloned(skb) ||
+ (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
+ ((skb->end - skb->head) < (PAGE_SIZE/2)) )
+ {
+ struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
+ int hlen = skb->data - skb->head;
+ if ( unlikely(nskb == NULL) )
+ goto drop;
+ skb_reserve(nskb, hlen);
+ __skb_put(nskb, skb->len);
+ (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
+ nskb->dev = skb->dev;
+ dev_kfree_skb(skb);
+ skb = nskb;
+ }
+
+ netif->rx_req_cons++;
+
+ skb_queue_tail(&rx_queue, skb);
+ tasklet_schedule(&net_rx_tasklet);
+
+ return 0;
+
+ drop:
+ netif->stats.rx_dropped++;
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+#if 0
+static void xen_network_done_notify(void)
+{
+ static struct net_device *eth0_dev = NULL;
+ if ( unlikely(eth0_dev == NULL) )
+ eth0_dev = __dev_get_by_name("eth0");
+ netif_rx_schedule(eth0_dev);
+}
+/*
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ * if ( xen_network_done() )
+ * tg3_enable_ints(tp);
+ */
+int xen_network_done(void)
+{
+ return skb_queue_empty(&rx_queue);
+}
+#endif
+
+static void net_rx_action(unsigned long unused)
+{
+ netif_t *netif;
+ s8 status;
+ u16 size, id, evtchn;
+ mmu_update_t *mmu;
+ multicall_entry_t *mcl;
+ unsigned long vdata, mdata, new_mfn;
+ struct sk_buff_head rxq;
+ struct sk_buff *skb;
+ u16 notify_list[NETIF_RX_RING_SIZE];
+ int notify_nr = 0;
+
+ skb_queue_head_init(&rxq);
+
+ mcl = rx_mcl;
+ mmu = rx_mmu;
+ while ( (skb = skb_dequeue(&rx_queue)) != NULL )
+ {
+ netif = (netif_t *)skb->dev->priv;
+ vdata = (unsigned long)skb->data;
+ mdata = virt_to_machine(vdata);
+ new_mfn = get_new_mfn();
+
+ mmu[0].ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+ mmu[0].val = __pa(vdata) >> PAGE_SHIFT;
+ mmu[1].val = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
+ mmu[1].ptr = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
+ mmu[1].ptr |= MMU_EXTENDED_COMMAND;
+ mmu[1].val |= MMUEXT_SET_SUBJECTDOM;
+ mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+ mmu[2].val = MMUEXT_REASSIGN_PAGE;
+
+ mcl[0].op = __HYPERVISOR_update_va_mapping;
+ mcl[0].args[0] = vdata >> PAGE_SHIFT;
+ mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
+ mcl[0].args[2] = 0;
+ mcl[1].op = __HYPERVISOR_mmu_update;
+ mcl[1].args[0] = (unsigned long)mmu;
+ mcl[1].args[1] = 3;
+ mcl[1].args[2] = 0;
+
+ mcl += 2;
+ mmu += 3;
+
+ __skb_queue_tail(&rxq, skb);
+
+ /* Filled the batch queue? */
+ if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) )
+ break;
+ }
+
+ if ( mcl == rx_mcl )
+ return;
+
+ mcl[-2].args[2] = UVMF_FLUSH_TLB;
+ if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) )
+ BUG();
+
+ mcl = rx_mcl;
+ mmu = rx_mmu;
+ while ( (skb = __skb_dequeue(&rxq)) != NULL )
+ {
+ netif = (netif_t *)skb->dev->priv;
+ size = skb->tail - skb->data;
+
+ /* Rederive the machine addresses. */
+ new_mfn = mcl[0].args[1] >> PAGE_SHIFT;
+ mdata = ((mmu[2].ptr & PAGE_MASK) |
+ ((unsigned long)skb->data & ~PAGE_MASK));
+
+ phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
+
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->frag_list = NULL;
+
+ netif->stats.rx_bytes += size;
+ netif->stats.rx_packets++;
+
+ /* The update_va_mapping() must not fail. */
+ if ( unlikely(mcl[0].args[5] != 0) )
+ BUG();
+
+ /* Check the reassignment error code. */
+ status = NETIF_RSP_OKAY;
+ if ( unlikely(mcl[1].args[5] != 0) )
+ {
+ DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid);
+ dealloc_mfn(mdata >> PAGE_SHIFT);
+ status = NETIF_RSP_ERROR;
+ }
+
+ evtchn = netif->evtchn;
+ id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id;
+ if ( make_rx_response(netif, id, status, mdata, size) &&
+ (rx_notify[evtchn] == 0) )
+ {
+ rx_notify[evtchn] = 1;
+ notify_list[notify_nr++] = evtchn;
+ }
+
+ dev_kfree_skb(skb);
+
+ mcl += 2;
+ mmu += 3;
+ }
+
+ while ( notify_nr != 0 )
+ {
+ evtchn = notify_list[--notify_nr];
+ rx_notify[evtchn] = 0;
+ notify_via_evtchn(evtchn);
+ }
+
+ /* More work to do? */
+ if ( !skb_queue_empty(&rx_queue) )
+ tasklet_schedule(&net_rx_tasklet);
+#if 0
+ else
+ xen_network_done_notify();
+#endif
+}
+
+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
+ netif_t *netif = dev->priv;
+ return &netif->stats;
+}
+
+static int __on_net_schedule_list(netif_t *netif)
+{
+ return netif->list.next != NULL;
+}
+
+static void remove_from_net_schedule_list(netif_t *netif)
+{
+ spin_lock_irq(&net_schedule_list_lock);
+ if ( likely(__on_net_schedule_list(netif)) )
+ {
+ list_del(&netif->list);
+ netif->list.next = NULL;
+ netif_put(netif);
+ }
+ spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static void add_to_net_schedule_list_tail(netif_t *netif)
+{
+ if ( __on_net_schedule_list(netif) )
+ return;
+
+ spin_lock_irq(&net_schedule_list_lock);
+ if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) )
+ {
+ list_add_tail(&netif->list, &net_schedule_list);
+ netif_get(netif);
+ }
+ spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static inline void netif_schedule_work(netif_t *netif)
+{
+ if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+ ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+ {
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action();
+ }
+}
+
+void netif_deschedule(netif_t *netif)
+{
+ remove_from_net_schedule_list(netif);
+}
+
+#if 0
+static void tx_credit_callback(unsigned long data)
+{
+ netif_t *netif = (netif_t *)data;
+ netif->remaining_credit = netif->credit_bytes;
+ netif_schedule_work(netif);
+}
+#endif
+
+static void net_tx_action(unsigned long unused)
+{
+ struct list_head *ent;
+ struct sk_buff *skb;
+ netif_t *netif;
+ netif_tx_request_t txreq;
+ u16 pending_idx;
+ NETIF_RING_IDX i;
+ struct page *page;
+ multicall_entry_t *mcl;
+ PEND_RING_IDX dc, dp;
+
+ if ( (dc = dealloc_cons) == (dp = dealloc_prod) )
+ goto skip_dealloc;
+
+ mcl = tx_mcl;
+ while ( dc != dp )
+ {
+ pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
+ mcl[0].op = __HYPERVISOR_update_va_mapping;
+ mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+ mcl[0].args[1] = 0;
+ mcl[0].args[2] = 0;
+ mcl++;
+ }
+
+ mcl[-1].args[2] = UVMF_FLUSH_TLB;
+ if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
+ BUG();
+
+ mcl = tx_mcl;
+ while ( dealloc_cons != dp )
+ {
+ /* The update_va_mapping() must not fail. */
+ if ( unlikely(mcl[0].args[5] != 0) )
+ BUG();
+
+ pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
+
+ netif = pending_tx_info[pending_idx].netif;
+
+ spin_lock(&netif->tx_lock);
+ make_tx_response(netif, pending_tx_info[pending_idx].req.id,
+ NETIF_RSP_OKAY);
+ spin_unlock(&netif->tx_lock);
+
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+
+ /*
+ * Scheduling checks must happen after the above response is posted.
+ * This avoids a possible race with a guest OS on another CPU.
+ */
+ mb();
+ if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+ ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+ add_to_net_schedule_list_tail(netif);
+
+ netif_put(netif);
+
+ mcl++;
+ }
+
+ skip_dealloc:
+ mcl = tx_mcl;
+ while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+ !list_empty(&net_schedule_list) )
+ {
+ /* Get a netif from the list with work to do. */
+ ent = net_schedule_list.next;
+ netif = list_entry(ent, netif_t, list);
+ netif_get(netif);
+ remove_from_net_schedule_list(netif);
+
+ /* Work to do? */
+ i = netif->tx_req_cons;
+ if ( (i == netif->tx->req_prod) ||
+ ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
+ {
+ netif_put(netif);
+ continue;
+ }
+ memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req,
+ sizeof(txreq));
+ netif->tx_req_cons++;
+
+#if 0
+ /* Credit-based scheduling. */
+        if ( txreq.size > netif->remaining_credit )
+ {
+ s_time_t now = NOW(), next_credit =
+ netif->credit_timeout.expires + MICROSECS(netif->credit_usec);
+ if ( next_credit <= now )
+ {
+ netif->credit_timeout.expires = now;
+ netif->remaining_credit = netif->credit_bytes;
+ }
+ else
+ {
+ netif->remaining_credit = 0;
+ netif->credit_timeout.expires = next_credit;
+ netif->credit_timeout.data = (unsigned long)netif;
+ netif->credit_timeout.function = tx_credit_callback;
+ netif->credit_timeout.cpu = smp_processor_id();
+ add_ac_timer(&netif->credit_timeout);
+ break;
+ }
+ }
+        netif->remaining_credit -= txreq.size;
+#endif
+
+ netif_schedule_work(netif);
+
+ if ( unlikely(txreq.size <= PKT_PROT_LEN) ||
+ unlikely(txreq.size > ETH_FRAME_LEN) )
+ {
+ DPRINTK("Bad packet size: %d\n", txreq.size);
+ make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+ netif_put(netif);
+ continue;
+ }
+
+        /* The payload must not cross a page boundary, as it cannot be fragmented. */
+ if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) )
+ {
+ DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n",
+ txreq.addr, txreq.size,
+ (txreq.addr &~PAGE_MASK) + txreq.size);
+ make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+ netif_put(netif);
+ continue;
+ }
+
+ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+
+ if ( unlikely((skb = alloc_skb(PKT_PROT_LEN+16, GFP_ATOMIC)) == NULL) )
+ {
+ DPRINTK("Can't allocate a skb in start_xmit.\n");
+ make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+ netif_put(netif);
+ break;
+ }
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, 16);
+
+ mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
+ mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+ mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
+ mcl[0].args[2] = 0;
+ mcl[0].args[3] = netif->domid;
+ mcl++;
+
+ memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq));
+ pending_tx_info[pending_idx].netif = netif;
+ *((u16 *)skb->data) = pending_idx;
+
+ __skb_queue_tail(&tx_queue, skb);
+
+ pending_cons++;
+
+ /* Filled the batch queue? */
+ if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
+ break;
+ }
+
+ if ( mcl == tx_mcl )
+ return;
+
+ if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
+ BUG();
+
+ mcl = tx_mcl;
+ while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
+ {
+ pending_idx = *((u16 *)skb->data);
+ netif = pending_tx_info[pending_idx].netif;
+ memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq));
+
+ /* Check the remap error code. */
+ if ( unlikely(mcl[0].args[5] != 0) )
+ {
+ DPRINTK("Bad page frame\n");
+ make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+ netif_put(netif);
+ kfree_skb(skb);
+ mcl++;
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ continue;
+ }
+
+ phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
+ txreq.addr >> PAGE_SHIFT;
+
+ __skb_put(skb, PKT_PROT_LEN);
+ memcpy(skb->data,
+ (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
+ PKT_PROT_LEN);
+
+ page = virt_to_page(MMAP_VADDR(pending_idx));
+
+ /* Append the packet payload as a fragment. */
+ skb_shinfo(skb)->frags[0].page = page;
+ skb_shinfo(skb)->frags[0].size = txreq.size - PKT_PROT_LEN;
+ skb_shinfo(skb)->frags[0].page_offset =
+ (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
+ skb_shinfo(skb)->nr_frags = 1;
+ skb->data_len = txreq.size - PKT_PROT_LEN;
+ skb->len += skb->data_len;
+
+ skb->dev = netif->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ /*
+ * Destructor information. We hideously abuse the 'mapping' pointer,
+ * which isn't otherwise used by us. The page deallocator is modified
+ * to interpret a non-NULL value as a destructor function to be called.
+ * This works okay because in all other cases the pointer must be NULL
+ * when the page is freed (normally Linux will explicitly bug out if
+     * it sees otherwise).
+ */
+ page->mapping = (struct address_space *)netif_page_release;
+ set_page_count(page, 1);
+
+ netif->stats.tx_bytes += txreq.size;
+ netif->stats.tx_packets++;
+
+ netif_rx(skb);
+ netif->dev->last_rx = jiffies;
+
+ mcl++;
+ }
+}
+
+static void netif_page_release(struct page *page)
+{
+ unsigned long flags;
+ u16 pending_idx = page - virt_to_page(mmap_vstart);
+
+ /* Stop the abuse. */
+ page->mapping = NULL;
+
+ spin_lock_irqsave(&dealloc_lock, flags);
+ dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+ spin_unlock_irqrestore(&dealloc_lock, flags);
+
+ tasklet_schedule(&net_tx_tasklet);
+}
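netif_page_release() is the destructor installed through the page->mapping
trick described in net_tx_action(); the hook that actually calls it lives in
the modified mm/page_alloc.c listed in the manifest but not included in this
excerpt. A plausible sketch of such a hook, written as a self-contained
helper; this is an assumption about that change, not the actual hunk:

    /* ASSUMED sketch of the free-path check added to mm/page_alloc.c. */
    static inline int demo_run_page_destructor(struct page *page)
    {
        void (*dtor)(struct page *);

        if ( likely(page->mapping == NULL) )
            return 0;                     /* ordinary free path continues  */

        dtor = (void (*)(struct page *))page->mapping;
        (*dtor)(page);                    /* e.g. netif_page_release()     */
        return 1;                         /* destructor now owns the page  */
    }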
+
+#if 0
+long flush_bufs_for_netif(netif_t *netif)
+{
+ NETIF_RING_IDX i;
+
+ /* Return any outstanding receive buffers to the guest OS. */
+ spin_lock(&netif->rx_lock);
+ for ( i = netif->rx_req_cons;
+ (i != netif->rx->req_prod) &&
+ ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE);
+ i++ )
+ {
+ make_rx_response(netif,
+ netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id,
+ NETIF_RSP_DROPPED, 0, 0);
+ }
+ netif->rx_req_cons = i;
+ spin_unlock(&netif->rx_lock);
+
+ /*
+ * Flush pending transmit buffers. The guest may still have to wait for
+ * buffers that are queued at a physical NIC.
+ */
+ spin_lock(&netif->tx_lock);
+ for ( i = netif->tx_req_cons;
+ (i != netif->tx->req_prod) &&
+ ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE);
+ i++ )
+ {
+ make_tx_response(netif,
+ netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id,
+ NETIF_RSP_DROPPED);
+ }
+ netif->tx_req_cons = i;
+ spin_unlock(&netif->tx_lock);
+
+ return 0;
+}
+#endif
+
+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+ netif_t *netif = dev_id;
+ if ( tx_work_exists(netif) )
+ {
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action();
+ }
+ return IRQ_HANDLED;
+}
+
+static void make_tx_response(netif_t *netif,
+ u16 id,
+ s8 st)
+{
+ NETIF_RING_IDX i = netif->tx_resp_prod;
+ netif_tx_response_t *resp;
+
+ resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp;
+ resp->id = id;
+ resp->status = st;
+ wmb();
+ netif->tx->resp_prod = netif->tx_resp_prod = ++i;
+
+ mb(); /* Update producer before checking event threshold. */
+ if ( i == netif->tx->event )
+ notify_via_evtchn(netif->evtchn);
+}
+
+static int make_rx_response(netif_t *netif,
+ u16 id,
+ s8 st,
+ memory_t addr,
+ u16 size)
+{
+ NETIF_RING_IDX i = netif->rx_resp_prod;
+ netif_rx_response_t *resp;
+
+ resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
+ resp->addr = addr;
+ resp->id = id;
+ resp->status = (s16)size;
+ if ( st < 0 )
+ resp->status = (s16)st;
+ wmb();
+ netif->rx->resp_prod = netif->rx_resp_prod = ++i;
+
+ mb(); /* Update producer before checking event threshold. */
+ return (i == netif->rx->event);
+}
+
+static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
+{
+ struct list_head *ent;
+ netif_t *netif;
+ int i = 0;
+
+ printk(KERN_ALERT "netif_schedule_list:\n");
+ spin_lock_irq(&net_schedule_list_lock);
+
+ list_for_each ( ent, &net_schedule_list )
+ {
+ netif = list_entry(ent, netif_t, list);
+ printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n",
+ i, netif->rx_req_cons, netif->rx_resp_prod);
+ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
+ netif->tx_req_cons, netif->tx_resp_prod);
+ printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n",
+ netif->rx->req_prod, netif->rx->resp_prod);
+ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
+ netif->rx->event, netif->tx->req_prod);
+ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
+ netif->tx->resp_prod, netif->tx->event);
+ i++;
+ }
+
+ spin_unlock_irq(&net_schedule_list_lock);
+ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
+
+ return IRQ_HANDLED;
+}
+
+static int __init netback_init(void)
+{
+ int i;
+
+ if ( !(start_info.flags & SIF_NET_BE_DOMAIN) &&
+ !(start_info.flags & SIF_INITDOMAIN) )
+ return 0;
+
+ printk("Initialising Xen netif backend\n");
+
+ skb_queue_head_init(&rx_queue);
+ skb_queue_head_init(&tx_queue);
+
+ netif_interface_init();
+
+ if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
+ BUG();
+
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ pending_ring[i] = i;
+
+ spin_lock_init(&net_schedule_list_lock);
+ INIT_LIST_HEAD(&net_schedule_list);
+
+ netif_ctrlif_init();
+
+ (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG),
+ netif_be_dbg, SA_SHIRQ,
+ "net-be-dbg", &netif_be_dbg);
+
+ return 0;
+}
+
+static void netback_cleanup(void)
+{
+ BUG();
+}
+
+module_init(netback_init);
+module_exit(netback_cleanup);
--- /dev/null
+
+config XENNET
+ tristate "Xen network driver"
+ depends on NETDEVICES && ARCH_XEN
+ help
+ Network driver for Xen
--- /dev/null
+
+obj-y := netfront.o
--- /dev/null
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+#include <asm-xen/evtchn.h>
+#include <asm-xen/ctrl_if.h>
+
+#include <asm/page.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <asm-xen/netif.h>
+#else
+#include "../netif.h"
+#define irqreturn_t void
+#define IRQ_HANDLED
+#endif
+
+#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
+
+static void network_tx_buf_gc(struct net_device *dev);
+static void network_alloc_rx_buffers(struct net_device *dev);
+
+static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
+
+static struct list_head dev_list;
+
+struct net_private
+{
+ struct list_head list;
+ struct net_device *dev;
+
+ struct net_device_stats stats;
+ NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
+ unsigned int tx_full;
+
+ netif_tx_interface_t *tx;
+ netif_rx_interface_t *rx;
+
+ spinlock_t tx_lock;
+ spinlock_t rx_lock;
+
+ unsigned int handle;
+ unsigned int evtchn;
+ unsigned int irq;
+
+ /* What is the status of our connection to the remote backend? */
+#define BEST_CLOSED 0
+#define BEST_DISCONNECTED 1
+#define BEST_CONNECTED 2
+ unsigned int backend_state;
+
+ /* Is this interface open or closed (down or up)? */
+#define UST_CLOSED 0
+#define UST_OPEN 1
+ unsigned int user_state;
+
+ /*
+ * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
+ * array is an index into a chain of free entries.
+ */
+ struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
+ struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
+};
+
+/* Access macros for acquiring/freeing slots in {tx,rx}_skbs[]. */
+#define ADD_ID_TO_FREELIST(_list, _id) \
+ (_list)[(_id)] = (_list)[0]; \
+ (_list)[0] = (void *)(unsigned long)(_id);
+#define GET_ID_FROM_FREELIST(_list) \
+ ({ unsigned long _id = (unsigned long)(_list)[0]; \
+ (_list)[0] = (_list)[_id]; \
+ (unsigned short)_id; })
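+
+/*
+ * Minimal usage sketch (illustration only, mirroring the xmit/gc paths below):
+ *
+ *     id = GET_ID_FROM_FREELIST(np->tx_skbs);    pop a free slot
+ *     np->tx_skbs[id] = skb;                     slot now owns the skb
+ *     ...
+ *     ADD_ID_TO_FREELIST(np->tx_skbs, id);       push the slot back
+ *
+ * Entry 0 is the head of the free chain, so both operations are O(1). Note
+ * that ADD_ID_TO_FREELIST expands to two statements, so it must not be used
+ * as the body of an unbraced if/while.
+ */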
+
+static struct net_device *find_dev_by_handle(unsigned int handle)
+{
+ struct list_head *ent;
+ struct net_private *np;
+ list_for_each ( ent, &dev_list )
+ {
+ np = list_entry(ent, struct net_private, list);
+ if ( np->handle == handle )
+ return np->dev;
+ }
+ return NULL;
+}
+
+/** Network interface info. */
+struct netif_ctrl {
+ /** Number of interfaces. */
+ int interface_n;
+ /** Number of connected interfaces. */
+ int connected_n;
+ /** Error code. */
+ int err;
+};
+
+static struct netif_ctrl netctrl;
+
+static void netctrl_init(void)
+{
+ memset(&netctrl, 0, sizeof(netctrl));
+ netctrl.interface_n = -1;
+}
+
+/** Get or set a network interface error.
+ */
+static int netctrl_err(int err)
+{
+    if ( (err < 0) && !netctrl.err )
+    {
+ netctrl.err = err;
+ printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
+ }
+ return netctrl.err;
+}
+
+/** Test if all network interfaces are connected.
+ *
+ * @return 1 if all connected, 0 if not, negative error code otherwise
+ */
+static int netctrl_connected(void)
+{
+ int ok = 0;
+ ok = (netctrl.err ? netctrl.err :
+ (netctrl.connected_n == netctrl.interface_n));
+ return ok;
+}
+
+/** Count the connected network interfaces.
+ *
+ * @return connected count
+ */
+static int netctrl_connected_count(void)
+{
+
+ struct list_head *ent;
+ struct net_private *np;
+ unsigned int connected;
+
+ connected = 0;
+
+ list_for_each(ent, &dev_list)
+ {
+ np = list_entry(ent, struct net_private, list);
+ if ( np->backend_state == BEST_CONNECTED )
+ connected++;
+ }
+
+ netctrl.connected_n = connected;
+ return connected;
+}
+
+static int network_open(struct net_device *dev)
+{
+ struct net_private *np = dev->priv;
+
+ memset(&np->stats, 0, sizeof(np->stats));
+
+ np->user_state = UST_OPEN;
+
+ network_alloc_rx_buffers(dev);
+ np->rx->event = np->rx_resp_cons + 1;
+
+ netif_start_queue(dev);
+
+ return 0;
+}
+
+
+static void network_tx_buf_gc(struct net_device *dev)
+{
+ NETIF_RING_IDX i, prod;
+ unsigned short id;
+ struct net_private *np = dev->priv;
+ struct sk_buff *skb;
+
+ if ( np->backend_state != BEST_CONNECTED )
+ return;
+
+ do {
+ prod = np->tx->resp_prod;
+
+ for ( i = np->tx_resp_cons; i != prod; i++ )
+ {
+ id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
+ skb = np->tx_skbs[id];
+ ADD_ID_TO_FREELIST(np->tx_skbs, id);
+ dev_kfree_skb_any(skb);
+ }
+
+ np->tx_resp_cons = prod;
+
+ /*
+ * Set a new event, then check for race with update of tx_cons. Note
+ * that it is essential to schedule a callback, no matter how few
+ * buffers are pending. Even if there is space in the transmit ring,
+ * higher layers may be blocked because too much data is outstanding:
+ * in such cases notification from Xen is likely to be the only kick
+ * that we'll get.
+ */
+ np->tx->event =
+ prod + ((np->tx->req_prod - prod) >> 1) + 1;
+ mb();
+ }
+ while ( prod != np->tx->resp_prod );
+
+ if ( np->tx_full &&
+ ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
+ {
+ np->tx_full = 0;
+ if ( np->user_state == UST_OPEN )
+ netif_wake_queue(dev);
+ }
+}
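+
+/*
+ * Worked example for the tx->event setting above (illustration only): with
+ * resp_prod == 10 and req_prod == 18, tx->event becomes 10 + (8 >> 1) + 1
+ * = 15, so the backend notifies us again once roughly half of the
+ * outstanding requests have been responded to, not on every response.
+ */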
+
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+ unsigned short id;
+ struct net_private *np = dev->priv;
+ struct sk_buff *skb;
+ NETIF_RING_IDX i = np->rx->req_prod;
+ int nr_pfns = 0;
+
+ /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
+ if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) ||
+ unlikely(np->backend_state != BEST_CONNECTED) )
+ return;
+
+ do {
+ skb = dev_alloc_skb(RX_BUF_SIZE);
+ if ( unlikely(skb == NULL) )
+ break;
+
+ skb->dev = dev;
+
+ if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
+ panic("alloc_skb needs to provide us page-aligned buffers.");
+
+ id = GET_ID_FROM_FREELIST(np->rx_skbs);
+
+ np->rx_skbs[id] = skb;
+
+ np->rx->ring[MASK_NETIF_RX_IDX(i)].req.id = id;
+
+ rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
+
+ rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
+ rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+ rx_mcl[nr_pfns].args[1] = 0;
+ rx_mcl[nr_pfns].args[2] = 0;
+
+ nr_pfns++;
+ }
+ while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
+
+ if ( unlikely(nr_pfns == 0) )
+ return;
+
+ /*
+ * We may have allocated buffers which have entries outstanding in the page
+ * update queue -- make sure we flush those first!
+ */
+ flush_page_update_queue();
+
+ /* After all PTEs have been zapped we blow away stale TLB entries. */
+ rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
+
+ /* Give away a batch of pages. */
+ rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
+ rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
+ rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
+ rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
+
+ /* Zap PTEs and give away pages in one big multicall. */
+ (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
+
+ /* Check return status of HYPERVISOR_dom_mem_op(). */
+ if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
+ panic("Unable to reduce memory reservation\n");
+
+ np->rx->req_prod = i;
+}
+
+
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned short id;
+ struct net_private *np = (struct net_private *)dev->priv;
+ netif_tx_request_t *tx;
+ NETIF_RING_IDX i;
+
+ if ( unlikely(np->tx_full) )
+ {
+ printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
+ netif_stop_queue(dev);
+ return -ENOBUFS;
+ }
+
+ if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
+ PAGE_SIZE) )
+ {
+ struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
+ if ( unlikely(new_skb == NULL) )
+ return 1;
+ skb_put(new_skb, skb->len);
+ memcpy(new_skb->data, skb->data, skb->len);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ }
+
+ spin_lock_irq(&np->tx_lock);
+
+ if ( np->backend_state != BEST_CONNECTED )
+ {
+ spin_unlock_irq(&np->tx_lock);
+ return 1;
+ }
+
+ i = np->tx->req_prod;
+
+ id = GET_ID_FROM_FREELIST(np->tx_skbs);
+ np->tx_skbs[id] = skb;
+
+ tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
+
+ tx->id = id;
+ tx->addr = virt_to_machine(skb->data);
+ tx->size = skb->len;
+
+ wmb();
+ np->tx->req_prod = i + 1;
+
+ network_tx_buf_gc(dev);
+
+ if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
+ {
+ np->tx_full = 1;
+ netif_stop_queue(dev);
+ }
+
+ spin_unlock_irq(&np->tx_lock);
+
+ np->stats.tx_bytes += skb->len;
+ np->stats.tx_packets++;
+
+ /* Only notify Xen if there are no outstanding responses. */
+ mb();
+ if ( np->tx->resp_prod == i )
+ notify_via_evtchn(np->evtchn);
+
+ return 0;
+}
+
+
+static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+ struct net_device *dev = dev_id;
+ struct net_private *np = dev->priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&np->tx_lock, flags);
+ network_tx_buf_gc(dev);
+ spin_unlock_irqrestore(&np->tx_lock, flags);
+
+ if ( (np->rx_resp_cons != np->rx->resp_prod) &&
+ (np->user_state == UST_OPEN) )
+ netif_rx_schedule(dev);
+
+ return IRQ_HANDLED;
+}
+
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+ struct net_private *np = dev->priv;
+ struct sk_buff *skb;
+ netif_rx_response_t *rx;
+ NETIF_RING_IDX i;
+ mmu_update_t *mmu = rx_mmu;
+ multicall_entry_t *mcl = rx_mcl;
+ int work_done, budget, more_to_do = 1;
+ struct sk_buff_head rxq;
+ unsigned long flags;
+
+ spin_lock(&np->rx_lock);
+
+ if ( np->backend_state != BEST_CONNECTED )
+ {
+ spin_unlock(&np->rx_lock);
+ return 0;
+ }
+
+ skb_queue_head_init(&rxq);
+
+ if ( (budget = *pbudget) > dev->quota )
+ budget = dev->quota;
+
+ for ( i = np->rx_resp_cons, work_done = 0;
+ (i != np->rx->resp_prod) && (work_done < budget);
+ i++, work_done++ )
+ {
+ rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
+
+ /*
+ * An error here is very odd. Usually indicates a backend bug,
+ * low-memory condition, or that we didn't have reservation headroom.
+ * Whatever - print an error and queue the id again straight away.
+ */
+ if ( unlikely(rx->status <= 0) )
+ {
+ /* Gate this error. We get a (valid) slew of them on suspend. */
+ if ( np->user_state == UST_OPEN )
+ printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
+ np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
+ wmb();
+ np->rx->req_prod++;
+ continue;
+ }
+
+ skb = np->rx_skbs[rx->id];
+ ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
+
+ skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
+ skb_put(skb, rx->status);
+
+ np->stats.rx_packets++;
+ np->stats.rx_bytes += rx->status;
+
+ /* Remap the page. */
+ mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
+ mmu->val = __pa(skb->head) >> PAGE_SHIFT;
+ mmu++;
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+ mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
+ mcl->args[2] = 0;
+ mcl++;
+
+ phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
+ rx->addr >> PAGE_SHIFT;
+
+ __skb_queue_tail(&rxq, skb);
+ }
+
+ /* Do all the remapping work, and M->P updates, in one big hypercall. */
+ if ( likely((mcl - rx_mcl) != 0) )
+ {
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)rx_mmu;
+ mcl->args[1] = mmu - rx_mmu;
+ mcl->args[2] = 0;
+ mcl++;
+ (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
+ }
+
+ while ( (skb = __skb_dequeue(&rxq)) != NULL )
+ {
+ /* Set the shared-info area, which is hidden behind the real data. */
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->frag_list = NULL;
+
+ /* Ethernet-specific work. Delayed to here as it peeks the header. */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Pass it up. */
+ netif_rx(skb);
+ dev->last_rx = jiffies;
+ }
+
+ np->rx_resp_cons = i;
+
+ network_alloc_rx_buffers(dev);
+
+ *pbudget -= work_done;
+ dev->quota -= work_done;
+
+ if ( work_done < budget )
+ {
+ local_irq_save(flags);
+
+ np->rx->event = i + 1;
+
+ /* Deal with hypervisor racing our resetting of rx_event. */
+ mb();
+ if ( np->rx->resp_prod == i )
+ {
+ __netif_rx_complete(dev);
+ more_to_do = 0;
+ }
+
+ local_irq_restore(flags);
+ }
+
+ spin_unlock(&np->rx_lock);
+
+ return more_to_do;
+}
+
+
+static int network_close(struct net_device *dev)
+{
+ struct net_private *np = dev->priv;
+ np->user_state = UST_CLOSED;
+ netif_stop_queue(np->dev);
+ return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+ struct net_private *np = (struct net_private *)dev->priv;
+ return &np->stats;
+}
+
+
+static void network_connect(struct net_device *dev,
+ netif_fe_interface_status_changed_t *status)
+{
+ struct net_private *np;
+ int i, requeue_idx;
+ netif_tx_request_t *tx;
+
+ np = dev->priv;
+ spin_lock_irq(&np->rx_lock);
+ spin_lock(&np->tx_lock);
+
+ /* Recovery procedure: */
+
+ /* Step 1: Reinitialise variables. */
+ np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
+ np->rx->event = 1;
+
+ /* Step 2: Rebuild the RX and TX ring contents.
+ * NB. We could just free the queued TX packets now but we hope
+ * that sending them out might do some good. We have to rebuild
+ * the RX ring because some of our pages are currently flipped out
+ * so we can't just free the RX skbs.
+ * NB2. Freelist index entries are always going to be less than
+     * __PAGE_OFFSET, whereas pointers to skbs will always be equal to or
+ * greater than __PAGE_OFFSET: we use this property to distinguish
+ * them.
+ */
+
+ /* Rebuild the TX buffer freelist and the TX ring itself.
+ * NB. This reorders packets. We could keep more private state
+ * to avoid this but maybe it doesn't matter so much given the
+ * interface has been down.
+ */
+ for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
+ {
+ if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
+ {
+ struct sk_buff *skb = np->tx_skbs[i];
+
+ tx = &np->tx->ring[requeue_idx++].req;
+
+ tx->id = i;
+ tx->addr = virt_to_machine(skb->data);
+ tx->size = skb->len;
+
+ np->stats.tx_bytes += skb->len;
+ np->stats.tx_packets++;
+ }
+ }
+ wmb();
+ np->tx->req_prod = requeue_idx;
+
+ /* Rebuild the RX buffer freelist and the RX ring itself. */
+ for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
+ if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
+ np->rx->ring[requeue_idx++].req.id = i;
+ wmb();
+ np->rx->req_prod = requeue_idx;
+
+ /* Step 3: All public and private state should now be sane. Get
+ * ready to start sending and receiving packets and give the driver
+ * domain a kick because we've probably just requeued some
+ * packets.
+ */
+ np->backend_state = BEST_CONNECTED;
+ notify_via_evtchn(status->evtchn);
+ network_tx_buf_gc(dev);
+
+ if ( np->user_state == UST_OPEN )
+ netif_start_queue(dev);
+
+ spin_unlock(&np->tx_lock);
+ spin_unlock_irq(&np->rx_lock);
+}
+
+static void netif_status_change(netif_fe_interface_status_changed_t *status)
+{
+ ctrl_msg_t cmsg;
+ netif_fe_interface_connect_t up;
+ struct net_device *dev;
+ struct net_private *np;
+
+ if ( netctrl.interface_n <= 0 )
+ {
+ printk(KERN_WARNING "Status change: no interfaces\n");
+ return;
+ }
+
+ dev = find_dev_by_handle(status->handle);
+    if ( dev == NULL )
+    {
+ printk(KERN_WARNING "Status change: invalid netif handle %u\n",
+ status->handle);
+ return;
+ }
+ np = dev->priv;
+
+ switch ( status->status )
+ {
+ case NETIF_INTERFACE_STATUS_DESTROYED:
+ printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
+ np->backend_state);
+ break;
+
+ case NETIF_INTERFACE_STATUS_DISCONNECTED:
+ if ( np->backend_state != BEST_CLOSED )
+ {
+ printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
+ " in state %d\n", np->backend_state);
+ printk(KERN_INFO "Attempting to reconnect network interface\n");
+
+ /* Begin interface recovery.
+ *
+ * NB. Whilst we're recovering, we turn the carrier state off. We
+ * take measures to ensure that this device isn't used for
+ * anything. We also stop the queue for this device. Various
+ * different approaches (e.g. continuing to buffer packets) have
+ * been tested but don't appear to improve the overall impact on
+ * TCP connections.
+ *
+ * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
+ * is initiated by a special "RESET" message - disconnect could
+ * just mean we're not allowed to use this interface any more.
+ */
+
+ /* Stop old i/f to prevent errors whilst we rebuild the state. */
+ spin_lock_irq(&np->tx_lock);
+ spin_lock(&np->rx_lock);
+ netif_stop_queue(dev);
+ np->backend_state = BEST_DISCONNECTED;
+ spin_unlock(&np->rx_lock);
+ spin_unlock_irq(&np->tx_lock);
+
+ /* Free resources. */
+ free_irq(np->irq, dev);
+ unbind_evtchn_from_irq(np->evtchn);
+ free_page((unsigned long)np->tx);
+ free_page((unsigned long)np->rx);
+ }
+
+ /* Move from CLOSED to DISCONNECTED state. */
+ np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
+ np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
+ memset(np->tx, 0, PAGE_SIZE);
+ memset(np->rx, 0, PAGE_SIZE);
+ np->backend_state = BEST_DISCONNECTED;
+
+ /* Construct an interface-CONNECT message for the domain controller. */
+ cmsg.type = CMSG_NETIF_FE;
+ cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT;
+ cmsg.length = sizeof(netif_fe_interface_connect_t);
+ up.handle = status->handle;
+ up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
+ up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
+ memcpy(cmsg.msg, &up, sizeof(up));
+
+ /* Tell the controller to bring up the interface. */
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+ break;
+
+ case NETIF_INTERFACE_STATUS_CONNECTED:
+ if ( np->backend_state == BEST_CLOSED )
+ {
+ printk(KERN_WARNING "Unexpected netif-CONNECTED message"
+ " in state %d\n", np->backend_state);
+ break;
+ }
+
+ memcpy(dev->dev_addr, status->mac, ETH_ALEN);
+
+ network_connect(dev, status);
+
+ np->evtchn = status->evtchn;
+ np->irq = bind_evtchn_to_irq(np->evtchn);
+ (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM,
+ dev->name, dev);
+
+ netctrl_connected_count();
+ break;
+
+ default:
+ printk(KERN_WARNING "Status change to unknown value %d\n",
+ status->status);
+ break;
+ }
+}
+
+/** Create a network device.
+ * @param handle device handle
+ * @param val return parameter for created device
+ * @return 0 on success, error code otherwise
+ */
+static int create_netdev(int handle, struct net_device **val)
+{
+ int i, err = 0;
+ struct net_device *dev = NULL;
+ struct net_private *np = NULL;
+
+ if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
+ {
+ printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ np = dev->priv;
+ np->backend_state = BEST_CLOSED;
+ np->user_state = UST_CLOSED;
+ np->handle = handle;
+
+ spin_lock_init(&np->tx_lock);
+ spin_lock_init(&np->rx_lock);
+
+ /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
+ for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
+ np->tx_skbs[i] = (void *)(i+1);
+ for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
+ np->rx_skbs[i] = (void *)(i+1);
+
+ dev->open = network_open;
+ dev->hard_start_xmit = network_start_xmit;
+ dev->stop = network_close;
+ dev->get_stats = network_get_stats;
+ dev->poll = netif_poll;
+ dev->weight = 64;
+
+ if ( (err = register_netdev(dev)) != 0 )
+ {
+ printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
+ goto exit;
+ }
+ np->dev = dev;
+ list_add(&np->list, &dev_list);
+
+ exit:
+    if ( (err != 0) && (dev != NULL) )
+ kfree(dev);
+ else if ( val != NULL )
+ *val = dev;
+ return err;
+}
+
+/*
+ * Initialize the network control interface. Set the number of network devices
+ * and create them.
+ */
+static void netif_driver_status_change(
+ netif_fe_driver_status_changed_t *status)
+{
+ int err = 0;
+ int i;
+
+ netctrl.interface_n = status->nr_interfaces;
+ netctrl.connected_n = 0;
+
+ for ( i = 0; i < netctrl.interface_n; i++ )
+ {
+ if ( (err = create_netdev(i, NULL)) != 0 )
+ {
+ netctrl_err(err);
+ break;
+ }
+ }
+}
+
+static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+ int respond = 1;
+
+ switch ( msg->subtype )
+ {
+ case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
+ if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
+ goto error;
+ netif_status_change((netif_fe_interface_status_changed_t *)
+ &msg->msg[0]);
+ break;
+
+ case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
+ if ( msg->length != sizeof(netif_fe_driver_status_changed_t) )
+ goto error;
+ netif_driver_status_change((netif_fe_driver_status_changed_t *)
+ &msg->msg[0]);
+ /* Message is a response */
+ respond = 0;
+ break;
+
+ error:
+ default:
+ msg->length = 0;
+ break;
+ }
+
+ if ( respond )
+ ctrl_if_send_response(msg);
+}
+
+
+static int __init netif_init(void)
+{
+ ctrl_msg_t cmsg;
+ netif_fe_driver_status_changed_t st;
+ int err = 0, wait_i, wait_n = 20;
+
+ if ( (start_info.flags & SIF_INITDOMAIN) ||
+ (start_info.flags & SIF_NET_BE_DOMAIN) )
+ return 0;
+
+    printk("Initialising Xen virtual ethernet frontend driver\n");
+
+ INIT_LIST_HEAD(&dev_list);
+
+ netctrl_init();
+
+ (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
+
+ /* Send a driver-UP notification to the domain controller. */
+ cmsg.type = CMSG_NETIF_FE;
+ cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
+ cmsg.length = sizeof(netif_fe_driver_status_changed_t);
+ st.status = NETIF_DRIVER_STATUS_UP;
+ st.nr_interfaces = 0;
+ memcpy(cmsg.msg, &st, sizeof(st));
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+ /* Wait for all interfaces to be connected. */
+ for ( wait_i = 0; ; wait_i++)
+ {
+ if ( (err = (wait_i < wait_n) ? netctrl_connected() : -ENETDOWN) != 0 )
+ {
+ err = (err > 0) ? 0 : err;
+ break;
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ }
+
+ if ( err )
+ ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
+
+ return err;
+}
+
+__initcall(netif_init);
#define xen_flush_page_update_queue() (_flush_page_update_queue())
void MULTICALL_flush_page_update_queue(void);
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+/* Allocate a contiguous empty region of low memory. Return virtual start. */
+unsigned long allocate_empty_lowmem_region(unsigned long pages);
+/* Deallocate a contiguous region of low memory. Return it to the allocator. */
+void deallocate_lowmem_region(unsigned long vstart, unsigned long pages);
+#endif
/*
* Assembler stubs for hyper-calls.
"b" (page_nr), "c" ((new_val).pte_low), "d" (flags) : "memory" );
if ( unlikely(ret < 0) )
- panic("Failed update VA mapping: %08lx, %08lx, %08lx",
- page_nr, (new_val).pte_low, flags);
-
+ {
+ printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
+ page_nr, (new_val).pte_low, flags);
+ BUG();
+ }
+
return ret;
}
--- /dev/null
+/*
+ * linux/mm/page_alloc.c
+ *
+ * Manages the free list, the system allocates free pages here.
+ * Note that kmalloc() lives in slab.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+
+#include <asm/tlbflush.h>
+
+DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
+struct pglist_data *pgdat_list;
+unsigned long totalram_pages;
+unsigned long totalhigh_pages;
+int nr_swap_pages;
+int numnodes = 1;
+int sysctl_lower_zone_protection = 0;
+
+EXPORT_SYMBOL(totalram_pages);
+EXPORT_SYMBOL(nr_swap_pages);
+
+/*
+ * Used by page_zone() to look up the address of the struct zone whose
+ * id is encoded in the upper bits of page->flags
+ */
+struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+EXPORT_SYMBOL(zone_table);
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+int min_free_kbytes = 1024;
+
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+ if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+ return 1;
+ if (page_to_pfn(page) < zone->zone_start_pfn)
+ return 1;
+ if (zone != page_zone(page))
+ return 1;
+ return 0;
+}
+
+static void bad_page(const char *function, struct page *page)
+{
+ printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
+ function, current->comm, page);
+ printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
+ (unsigned long)page->flags, page->mapping,
+ (int)page->mapcount, page_count(page));
+ printk(KERN_EMERG "Backtrace:\n");
+ dump_stack();
+ printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
+ page->flags &= ~(1 << PG_private |
+ 1 << PG_locked |
+ 1 << PG_lru |
+ 1 << PG_active |
+ 1 << PG_dirty |
+ 1 << PG_maplock |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
+ 1 << PG_writeback);
+ set_page_count(page, 0);
+ page->mapping = NULL;
+ page->mapcount = 0;
+}
+
+#ifndef CONFIG_HUGETLB_PAGE
+#define prep_compound_page(page, order) do { } while (0)
+#define destroy_compound_page(page, order) do { } while (0)
+#else
+/*
+ * Higher-order pages are called "compound pages". They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set. All pages have their ->private pointing at
+ * the head page (even the head page has this).
+ *
+ * The first tail page's ->mapping, if non-zero, holds the address of the
+ * compound page's put_page() function.
+ *
+ * The order of the allocation is stored in the first tail page's ->index
+ * This is only for debug at present. This usage means that zero-order pages
+ * may not be compound.
+ */
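+/*
+ * Worked example (illustration only): after prep_compound_page(page, 2) the
+ * four struct pages look like
+ *     page[0]: PG_compound, private = page           (head page)
+ *     page[1]: PG_compound, private = page, mapping = 0, index = 2
+ *     page[2]: PG_compound, private = page
+ *     page[3]: PG_compound, private = page
+ * and destroy_compound_page() below checks exactly this layout.
+ */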
+static void prep_compound_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ page[1].mapping = 0;
+ page[1].index = order;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ SetPageCompound(p);
+ p->private = (unsigned long)page;
+ }
+}
+
+static void destroy_compound_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ if (!PageCompound(page))
+ return;
+
+ if (page[1].index != order)
+ bad_page(__FUNCTION__, page);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ if (!PageCompound(p))
+ bad_page(__FUNCTION__, page);
+ if (p->private != (unsigned long)page)
+ bad_page(__FUNCTION__, page);
+ ClearPageCompound(p);
+ }
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep one bit for each pair of blocks, which
+ * is set to 1 iff only one of the pair is allocated. So when we
+ * are allocating or freeing one, we can derive the state of the
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.
+ *
+ * -- wli
+ */
+
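+/*
+ * Worked example (illustration only): freeing the order-0 page at page_idx 5,
+ * so mask = ~0UL and -mask = 1:
+ *     order 0: buddy is 5 ^ 1 = 4; if the bitmap toggle says 4 is also free,
+ *              unlink page 4, then mask <<= 1 and page_idx &= mask -> 4.
+ *     order 1: buddy of block 4 is 4 ^ 2 = 6; repeat the test one level up.
+ *     stop:    when the buddy is still allocated (or MAX_ORDER-1 is reached),
+ *              the merged block at page_idx goes onto that order's free_list.
+ */
+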
+static inline void __free_pages_bulk (struct page *page, struct page *base,
+ struct zone *zone, struct free_area *area, unsigned long mask,
+ unsigned int order)
+{
+ unsigned long page_idx, index;
+
+ if (order)
+ destroy_compound_page(page, order);
+ page_idx = page - base;
+ if (page_idx & ~mask)
+ BUG();
+ index = page_idx >> (1 + order);
+
+ zone->free_pages -= mask;
+ while (mask + (1 << (MAX_ORDER-1))) {
+ struct page *buddy1, *buddy2;
+
+ BUG_ON(area >= zone->free_area + MAX_ORDER);
+ if (!__test_and_change_bit(index, area->map))
+ /*
+ * the buddy page is still allocated.
+ */
+ break;
+ /*
+ * Move the buddy up one level.
+ * This code is taking advantage of the identity:
+ * -mask = 1+~mask
+ */
+ buddy1 = base + (page_idx ^ -mask);
+ buddy2 = base + page_idx;
+ BUG_ON(bad_range(zone, buddy1));
+ BUG_ON(bad_range(zone, buddy2));
+ list_del(&buddy1->lru);
+ mask <<= 1;
+ area++;
+ index >>= 1;
+ page_idx &= mask;
+ }
+ list_add(&(base + page_idx)->lru, &area->free_list);
+}
+
+static inline void free_pages_check(const char *function, struct page *page)
+{
+ if ( page_mapped(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_lru |
+ 1 << PG_private |
+ 1 << PG_locked |
+ 1 << PG_active |
+ 1 << PG_reclaim |
+ 1 << PG_slab |
+ 1 << PG_maplock |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
+ 1 << PG_writeback )))
+ bad_page(function, page);
+ if (PageDirty(page))
+ ClearPageDirty(page);
+}
+
+/*
+ * Frees a list of pages.
+ * Assumes all pages on list are in same zone, and of same order.
+ * count is the number of pages to free, or 0 for all on the list.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
+ */
+static int
+free_pages_bulk(struct zone *zone, int count,
+ struct list_head *list, unsigned int order)
+{
+ unsigned long mask, flags;
+ struct free_area *area;
+ struct page *base, *page = NULL;
+ int ret = 0;
+
+ mask = (~0UL) << order;
+ base = zone->zone_mem_map;
+ area = zone->free_area + order;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->all_unreclaimable = 0;
+ zone->pages_scanned = 0;
+ while (!list_empty(list) && count--) {
+ page = list_entry(list->prev, struct page, lru);
+ /* have to delete it as __free_pages_bulk list manipulates */
+ list_del(&page->lru);
+ __free_pages_bulk(page, base, zone, area, mask, order);
+ ret++;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+}
+
+void __free_pages_ok(struct page *page, unsigned int order)
+{
+ LIST_HEAD(list);
+ int i;
+
+ mod_page_state(pgfree, 1 << order);
+ for (i = 0 ; i < (1 << order) ; ++i)
+ free_pages_check(__FUNCTION__, page + i);
+ list_add(&page->lru, &list);
+ kernel_map_pages(page, 1<<order, 0);
+ free_pages_bulk(page_zone(page), 1, &list, order);
+}
+
+#define MARK_USED(index, order, area) \
+ __change_bit((index) >> (1+(order)), (area)->map)
+
+static inline struct page *
+expand(struct zone *zone, struct page *page,
+ unsigned long index, int low, int high, struct free_area *area)
+{
+ unsigned long size = 1 << high;
+
+ while (high > low) {
+ BUG_ON(bad_range(zone, page));
+ area--;
+ high--;
+ size >>= 1;
+ list_add(&page->lru, &area->free_list);
+ MARK_USED(index, high, area);
+ index += size;
+ page += size;
+ }
+ return page;
+}
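+
+/*
+ * Worked example (illustration only): satisfying an order-0 request from an
+ * order-3 block p0..p7, i.e. expand(zone, p0, index, 0, 3, area):
+ *     p0..p3 -> order-2 free_list, p4..p5 -> order-1 free_list,
+ *     p6 -> order-0 free_list, and p7 is returned to the caller.
+ * MARK_USED() toggles the buddy bitmap at each level as the block is split.
+ */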
+
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+ set_page_count(page, 1);
+#else
+ int i;
+
+ /*
+ * We need to reference all the pages for this order, otherwise if
+ * anyone accesses one of the pages with (get/put) it will be freed.
+ */
+ for (i = 0; i < (1 << order); i++)
+ set_page_count(page+i, 1);
+#endif /* CONFIG_MMU */
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static void prep_new_page(struct page *page, int order)
+{
+ if (page->mapping || page_mapped(page) ||
+ (page->flags & (
+ 1 << PG_private |
+ 1 << PG_locked |
+ 1 << PG_lru |
+ 1 << PG_active |
+ 1 << PG_dirty |
+ 1 << PG_reclaim |
+ 1 << PG_maplock |
+ 1 << PG_anon |
+ 1 << PG_swapcache |
+ 1 << PG_writeback )))
+ bad_page(__FUNCTION__, page);
+
+ page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_arch_1 |
+ 1 << PG_checked | 1 << PG_mappedtodisk);
+ page->private = 0;
+ set_page_refs(page, order);
+}
+
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order)
+{
+ struct free_area * area;
+ unsigned int current_order;
+ struct page *page;
+ unsigned int index;
+
+ for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ area = zone->free_area + current_order;
+ if (list_empty(&area->free_list))
+ continue;
+
+ page = list_entry(area->free_list.next, struct page, lru);
+ list_del(&page->lru);
+ index = page - zone->zone_mem_map;
+ if (current_order != MAX_ORDER-1)
+ MARK_USED(index, current_order, area);
+ zone->free_pages -= 1UL << order;
+ return expand(zone, page, index, order, current_order, area);
+ }
+
+ return NULL;
+}
+
+/*
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency. Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ unsigned long count, struct list_head *list)
+{
+ unsigned long flags;
+ int i;
+ int allocated = 0;
+ struct page *page;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (i = 0; i < count; ++i) {
+ page = __rmqueue(zone, order);
+ if (page == NULL)
+ break;
+ allocated++;
+ list_add_tail(&page->lru, list);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return allocated;
+}
+
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+static void __drain_pages(unsigned int cpu)
+{
+ struct zone *zone;
+ int i;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset;
+
+ pset = &zone->pageset[cpu];
+ for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &pset->pcp[i];
+ pcp->count -= free_pages_bulk(zone, pcp->count,
+ &pcp->list, 0);
+ }
+ }
+}
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_PM
+int is_head_of_free_region(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
+ int order;
+ struct list_head *curr;
+
+ /*
+ * Should not matter as we need quiescent system for
+ * suspend anyway, but...
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = MAX_ORDER - 1; order >= 0; --order)
+ list_for_each(curr, &zone->free_area[order].free_list)
+ if (page == list_entry(curr, struct page, lru)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 1 << order;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 0;
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ */
+void drain_local_pages(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __drain_pages(smp_processor_id());
+ local_irq_restore(flags);
+}
+#endif /* CONFIG_PM */
+
+static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+{
+#ifdef CONFIG_NUMA
+ unsigned long flags;
+ int cpu;
+ pg_data_t *pg = z->zone_pgdat;
+ pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
+ struct per_cpu_pageset *p;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ p = &z->pageset[cpu];
+ if (pg == orig) {
+ z->pageset[cpu].numa_hit++;
+ } else {
+ p->numa_miss++;
+ zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ }
+ if (pg == NODE_DATA(numa_node_id()))
+ p->local_node++;
+ else
+ p->other_node++;
+ local_irq_restore(flags);
+#endif
+}
+
+/*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void fastcall free_hot_cold_page(struct page *page, int cold)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ unsigned long flags;
+
+ /* XXX Xen: use mapping pointer as skb/data-page destructor */
+ if (page->mapping)
+ return (*(void(*)(struct page *))page->mapping)(page);
+
+ kernel_map_pages(page, 1, 0);
+ inc_page_state(pgfree);
+ free_pages_check(__FUNCTION__, page);
+ pcp = &zone->pageset[get_cpu()].pcp[cold];
+ local_irq_save(flags);
+ if (pcp->count >= pcp->high)
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ list_add(&page->lru, &pcp->list);
+ pcp->count++;
+ local_irq_restore(flags);
+ put_cpu();
+}
+
+void fastcall free_hot_page(struct page *page)
+{
+ free_hot_cold_page(page, 0);
+}
+
+void fastcall free_cold_page(struct page *page)
+{
+ free_hot_cold_page(page, 1);
+}
+
+/*
+ * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
+ * we cheat by calling it from here, in the order > 0 path. Saves a branch
+ * or two.
+ */
+
+static struct page *
+buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+ int cold = !!(gfp_flags & __GFP_COLD);
+
+ if (order == 0) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[get_cpu()].pcp[cold];
+ local_irq_save(flags);
+ if (pcp->count <= pcp->low)
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, &pcp->list);
+ if (pcp->count) {
+ page = list_entry(pcp->list.next, struct page, lru);
+ list_del(&page->lru);
+ pcp->count--;
+ }
+ local_irq_restore(flags);
+ put_cpu();
+ }
+
+ if (page == NULL) {
+ spin_lock_irqsave(&zone->lock, flags);
+ page = __rmqueue(zone, order);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ if (page != NULL) {
+ BUG_ON(bad_range(zone, page));
+ mod_page_state_zone(zone, pgalloc, 1 << order);
+ prep_new_page(page, order);
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
+ }
+ return page;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ *
+ * Herein lies the mysterious "incremental min". That's the
+ *
+ * local_low = z->pages_low;
+ * min += local_low;
+ *
+ * thing. The intent here is to provide additional protection to low zones for
+ * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM
+ * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
+ * request. This preserves additional space in those lower zones for requests
+ * which really do need memory from those zones. It means that on a decent
+ * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
+ * zone untouched.
+ */
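+/*
+ * Hedged illustration (the actual protection values are computed elsewhere):
+ * for a low zone such as ZONE_DMA, protection[ZONE_HIGHMEM] is larger than
+ * protection[ZONE_NORMAL], which is larger than protection[ZONE_DMA], so in
+ *
+ *     min = (1 << order) + z->protection[alloc_type];
+ *
+ * a highmem-capable request must leave more free DMA pages behind than a
+ * GFP_KERNEL request, and a GFP_DMA request sees no extra penalty at all.
+ */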
+struct page * fastcall
+__alloc_pages(unsigned int gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ const int wait = gfp_mask & __GFP_WAIT;
+ unsigned long min;
+ struct zone **zones;
+ struct page *page;
+ struct reclaim_state reclaim_state;
+ struct task_struct *p = current;
+ int i;
+ int alloc_type;
+ int do_retry;
+
+ might_sleep_if(wait);
+
+ zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
+ if (zones[0] == NULL) /* no zones in the zonelist */
+ return NULL;
+
+ alloc_type = zone_idx(zones[0]);
+
+ /* Go through the zonelist once, looking for a zone with enough free */
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *z = zones[i];
+
+ min = (1<<order) + z->protection[alloc_type];
+
+ /*
+ * We let real-time tasks dip their real-time paws a little
+ * deeper into reserves.
+ */
+ if (rt_task(p))
+ min -= z->pages_low >> 1;
+
+ if (z->free_pages >= min ||
+ (!wait && z->free_pages >= z->pages_high)) {
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, z);
+ goto got_pg;
+ }
+ }
+ }
+
+ /* we're somewhat low on memory, failed to find what we needed */
+ for (i = 0; zones[i] != NULL; i++)
+ wakeup_kswapd(zones[i]);
+
+ /* Go through the zonelist again, taking __GFP_HIGH into account */
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *z = zones[i];
+
+ min = (1<<order) + z->protection[alloc_type];
+
+ if (gfp_mask & __GFP_HIGH)
+ min -= z->pages_low >> 2;
+ if (rt_task(p))
+ min -= z->pages_low >> 1;
+
+ if (z->free_pages >= min ||
+ (!wait && z->free_pages >= z->pages_high)) {
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, z);
+ goto got_pg;
+ }
+ }
+ }
+
+ /* here we're in the low on memory slow path */
+
+rebalance:
+ if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+ /* go through the zonelist yet again, ignoring mins */
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *z = zones[i];
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, z);
+ goto got_pg;
+ }
+ }
+ goto nopage;
+ }
+
+ /* Atomic allocations - we can't balance anything */
+ if (!wait)
+ goto nopage;
+
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ try_to_free_pages(zones, gfp_mask, order);
+
+ p->reclaim_state = NULL;
+ p->flags &= ~PF_MEMALLOC;
+
+ /* go through the zonelist yet one more time */
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *z = zones[i];
+
+ min = (1UL << order) + z->protection[alloc_type];
+
+ if (z->free_pages >= min ||
+ (!wait && z->free_pages >= z->pages_high)) {
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, z);
+ goto got_pg;
+ }
+ }
+ }
+
+ /*
+ * Don't let big-order allocations loop unless the caller explicitly
+ * requests that. Wait for some write requests to complete then retry.
+ *
+ * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
+ * may not be true in other implementations.
+ */
+ do_retry = 0;
+ if (!(gfp_mask & __GFP_NORETRY)) {
+ if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+ do_retry = 1;
+ if (gfp_mask & __GFP_NOFAIL)
+ do_retry = 1;
+ }
+ if (do_retry) {
+ blk_congestion_wait(WRITE, HZ/50);
+ goto rebalance;
+ }
+
+nopage:
+ if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+ printk(KERN_WARNING "%s: page allocation failure."
+ " order:%d, mode:0x%x\n",
+ p->comm, order, gfp_mask);
+ dump_stack();
+ }
+ return NULL;
+got_pg:
+ kernel_map_pages(page, 1 << order, 1);
+ return page;
+}
+
+EXPORT_SYMBOL(__alloc_pages);
+
+#ifdef CONFIG_NUMA
+/* Early boot: Everything is done by one cpu, but the data structures will be
+ * used by all cpus - spread them on all nodes.
+ */
+static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
+{
+	static int nodenr;
+ int i = nodenr;
+ struct page *page;
+
+ for (;;) {
+ if (i > nodenr + numnodes)
+ return 0;
+ if (node_present_pages(i%numnodes)) {
+ struct zone **z;
+ /* The node contains memory. Check that there is
+ * memory in the intended zonelist.
+ */
+ z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
+ while (*z) {
+ if ( (*z)->free_pages > (1UL<<order))
+ goto found_node;
+ z++;
+ }
+ }
+ i++;
+ }
+found_node:
+ nodenr = i+1;
+ page = alloc_pages_node(i%numnodes, gfp_mask, order);
+ if (!page)
+ return 0;
+ return (unsigned long) page_address(page);
+}
+#endif
+
+/*
+ * Common helper functions.
+ */
+fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
+{
+ struct page * page;
+
+#ifdef CONFIG_NUMA
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return get_boot_pages(gfp_mask, order);
+#endif
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
+ return 0;
+ return (unsigned long) page_address(page);
+}
+
+EXPORT_SYMBOL(__get_free_pages);
+
+fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
+{
+ struct page * page;
+
+ /*
+ * get_zeroed_page() returns a 32-bit address, which cannot represent
+ * a highmem page
+ */
+ BUG_ON(gfp_mask & __GFP_HIGHMEM);
+
+ page = alloc_pages(gfp_mask, 0);
+ if (page) {
+ void *address = page_address(page);
+ clear_page(address);
+ return (unsigned long) address;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(get_zeroed_page);
+
+void __pagevec_free(struct pagevec *pvec)
+{
+ int i = pagevec_count(pvec);
+
+ while (--i >= 0)
+ free_hot_cold_page(pvec->pages[i], pvec->cold);
+}
+
+fastcall void __free_pages(struct page *page, unsigned int order)
+{
+ if (!PageReserved(page) && put_page_testzero(page)) {
+ if (order == 0)
+ free_hot_page(page);
+ else
+ __free_pages_ok(page, order);
+ }
+}
+
+EXPORT_SYMBOL(__free_pages);
+
+fastcall void free_pages(unsigned long addr, unsigned int order)
+{
+ if (addr != 0) {
+ BUG_ON(!virt_addr_valid(addr));
+ __free_pages(virt_to_page(addr), order);
+ }
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages(void)
+{
+ unsigned int sum = 0;
+ struct zone *zone;
+
+ for_each_zone(zone)
+ sum += zone->free_pages;
+
+ return sum;
+}
+
+EXPORT_SYMBOL(nr_free_pages);
+
+unsigned int nr_used_zone_pages(void)
+{
+ unsigned int pages = 0;
+ struct zone *zone;
+
+ for_each_zone(zone)
+ pages += zone->nr_active + zone->nr_inactive;
+
+ return pages;
+}
+
+#ifdef CONFIG_NUMA
+unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
+{
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ sum += pgdat->node_zones[i].free_pages;
+
+ return sum;
+}
+#endif
+
+static unsigned int nr_free_zone_pages(int offset)
+{
+ pg_data_t *pgdat;
+ unsigned int sum = 0;
+
+ for_each_pgdat(pgdat) {
+ struct zonelist *zonelist = pgdat->node_zonelists + offset;
+ struct zone **zonep = zonelist->zones;
+ struct zone *zone;
+
+ for (zone = *zonep++; zone; zone = *zonep++) {
+ unsigned long size = zone->present_pages;
+ unsigned long high = zone->pages_high;
+ if (size > high)
+ sum += size - high;
+ }
+ }
+
+ return sum;
+}
+
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_free_buffer_pages(void)
+{
+ return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+}
+
+/*
+ * Amount of free RAM allocatable within all zones
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+ return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+}
+
+#ifdef CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+ pg_data_t *pgdat;
+ unsigned int pages = 0;
+
+ for_each_pgdat(pgdat)
+ pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+ return pages;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static void show_node(struct zone *zone)
+{
+ printk("Node %d ", zone->zone_pgdat->node_id);
+}
+#else
+#define show_node(zone) do { } while (0)
+#endif
+
+/*
+ * Accumulate the page_state information across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+DEFINE_PER_CPU(struct page_state, page_states) = {0};
+EXPORT_PER_CPU_SYMBOL(page_states);
+
+atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
+#endif
+
+void __get_page_state(struct page_state *ret, int nr)
+{
+ int cpu = 0;
+
+ memset(ret, 0, sizeof(*ret));
+ while (cpu < NR_CPUS) {
+ unsigned long *in, *out, off;
+
+ if (!cpu_possible(cpu)) {
+ cpu++;
+ continue;
+ }
+
+ in = (unsigned long *)&per_cpu(page_states, cpu);
+ cpu++;
+ if (cpu < NR_CPUS && cpu_possible(cpu))
+ prefetch(&per_cpu(page_states, cpu));
+ out = (unsigned long *)ret;
+ for (off = 0; off < nr; off++)
+ *out++ += *in++;
+ }
+}
+
+void get_page_state(struct page_state *ret)
+{
+ int nr;
+
+ nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+ nr /= sizeof(unsigned long);
+
+ __get_page_state(ret, nr + 1);
+}
+
+void get_full_page_state(struct page_state *ret)
+{
+ __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
+}
+
+unsigned long __read_page_state(unsigned offset)
+{
+ unsigned long ret = 0;
+ int cpu;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ unsigned long in;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ in = (unsigned long)&per_cpu(page_states, cpu) + offset;
+ ret += *((unsigned long *)in);
+ }
+ return ret;
+}
+
+void get_zone_counts(unsigned long *active,
+ unsigned long *inactive, unsigned long *free)
+{
+ struct zone *zone;
+
+ *active = 0;
+ *inactive = 0;
+ *free = 0;
+ for_each_zone(zone) {
+ *active += zone->nr_active;
+ *inactive += zone->nr_inactive;
+ *free += zone->free_pages;
+ }
+}
+
+void si_meminfo(struct sysinfo *val)
+{
+ val->totalram = totalram_pages;
+ val->sharedram = 0;
+ val->freeram = nr_free_pages();
+ val->bufferram = nr_blockdev_pages();
+#ifdef CONFIG_HIGHMEM
+ val->totalhigh = totalhigh_pages;
+ val->freehigh = nr_free_highpages();
+#else
+ val->totalhigh = 0;
+ val->freehigh = 0;
+#endif
+ val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ val->totalram = pgdat->node_present_pages;
+ val->freeram = nr_free_pages_pgdat(pgdat);
+ val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+ val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ */
+void show_free_areas(void)
+{
+ struct page_state ps;
+ int cpu, temperature;
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long free;
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ show_node(zone);
+ printk("%s per-cpu:", zone->name);
+
+ if (!zone->present_pages) {
+ printk(" empty\n");
+ continue;
+ } else
+ printk("\n");
+
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ struct per_cpu_pageset *pageset;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ pageset = zone->pageset + cpu;
+
+ for (temperature = 0; temperature < 2; temperature++)
+ printk("cpu %d %s: low %d, high %d, batch %d\n",
+ cpu,
+ temperature ? "cold" : "hot",
+ pageset->pcp[temperature].low,
+ pageset->pcp[temperature].high,
+ pageset->pcp[temperature].batch);
+ }
+ }
+
+ get_page_state(&ps);
+ get_zone_counts(&active, &inactive, &free);
+
+ printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+ K(nr_free_pages()),
+ K(nr_free_highpages()));
+
+ printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+ "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ active,
+ inactive,
+ ps.nr_dirty,
+ ps.nr_writeback,
+ ps.nr_unstable,
+ nr_free_pages(),
+ ps.nr_slab,
+ ps.nr_mapped,
+ ps.nr_page_table_pages);
+
+ for_each_zone(zone) {
+ int i;
+
+ show_node(zone);
+ printk("%s"
+ " free:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " active:%lukB"
+ " inactive:%lukB"
+ " present:%lukB"
+ "\n",
+ zone->name,
+ K(zone->free_pages),
+ K(zone->pages_min),
+ K(zone->pages_low),
+ K(zone->pages_high),
+ K(zone->nr_active),
+ K(zone->nr_inactive),
+ K(zone->present_pages)
+ );
+ printk("protections[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(" %lu", zone->protection[i]);
+ printk("\n");
+ }
+
+ for_each_zone(zone) {
+ struct list_head *elem;
+ unsigned long nr, flags, order, total = 0;
+
+ show_node(zone);
+ printk("%s: ", zone->name);
+ if (!zone->present_pages) {
+ printk("empty\n");
+ continue;
+ }
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ nr = 0;
+ list_for_each(elem, &zone->free_area[order].free_list)
+ ++nr;
+ total += nr << order;
+ printk("%lu*%lukB ", nr, K(1UL) << order);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ printk("= %lukB\n", K(total));
+ }
+
+ show_swap_cache_info();
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+{
+ switch (k) {
+ struct zone *zone;
+ default:
+ BUG();
+ case ZONE_HIGHMEM:
+ zone = pgdat->node_zones + ZONE_HIGHMEM;
+ if (zone->present_pages) {
+#ifndef CONFIG_HIGHMEM
+ BUG();
+#endif
+ zonelist->zones[j++] = zone;
+ }
+ case ZONE_NORMAL:
+ zone = pgdat->node_zones + ZONE_NORMAL;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ case ZONE_DMA:
+ zone = pgdat->node_zones + ZONE_DMA;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+
+ return j;
+}
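+
+/*
+ * Illustration (single node, all three zones populated): the fall-through
+ * switch above yields
+ *     k == ZONE_HIGHMEM -> zones[] = { HighMem, Normal, DMA }
+ *     k == ZONE_NORMAL  -> zones[] = { Normal, DMA }
+ *     k == ZONE_DMA     -> zones[] = { DMA }
+ * i.e. each allocation type falls back only to zones at or below its own.
+ */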
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (numnodes)
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given
+ * node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: pointer to the bitmap of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list. The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, void *used_node_mask)
+{
+ int i, n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ for (i = 0; i < numnodes; i++) {
+ cpumask_t tmp;
+
+ /* Start from local node */
+ n = (node+i)%numnodes;
+
+ /* Don't want a node to appear more than once */
+ if (test_bit(n, used_node_mask))
+ continue;
+
+ /* Use the distance array to find the distance */
+ val = node_distance(node, n);
+
+ /* Give preference to headless and unused nodes */
+ tmp = node_to_cpumask(n);
+ if (!cpus_empty(tmp))
+ val += PENALTY_FOR_NODE_WITH_CPUS;
+
+ /* Slight preference for less loaded node */
+ val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val += node_load[n];
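+ /*
+ * The scaling above makes the distance term dominate, so node_load[]
+ * mostly just breaks ties between equally distant candidates.
+ */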
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ if (best_node >= 0)
+ set_bit(best_node, used_node_mask);
+
+ return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k, node, local_node;
+ int prev_node, load;
+ struct zonelist *zonelist;
+ DECLARE_BITMAP(used_mask, MAX_NUMNODES);
+
+ /* initialize zonelists */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ memset(zonelist, 0, sizeof(*zonelist));
+ zonelist->zones[0] = NULL;
+ }
+
+ /* NUMA-aware ordering of nodes */
+ local_node = pgdat->node_id;
+ load = numnodes;
+ prev_node = local_node;
+ bitmap_zero(used_mask, MAX_NUMNODES);
+ while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
+ /*
+ * We don't want to pressure a particular node.
+ * So we add a penalty to the first node in the same
+ * distance group to make the ordering round-robin.
+ */
+ if (node_distance(local_node, node) !=
+ node_distance(local_node, prev_node))
+ node_load[node] += load;
+ prev_node = node;
+ load--;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ for (j = 0; zonelist->zones[j] != NULL; j++);
+
+ k = ZONE_NORMAL;
+ if (i & __GFP_HIGHMEM)
+ k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA)
+ k = ZONE_DMA;
+
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ zonelist->zones[j] = NULL;
+ }
+ }
+}
+
+#else /* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k, node, local_node;
+
+ local_node = pgdat->node_id;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zonelist *zonelist;
+
+ zonelist = pgdat->node_zonelists + i;
+ memset(zonelist, 0, sizeof(*zonelist));
+
+ j = 0;
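+ /*
+ * The zonelist index i is interpreted as the GFP zone-modifier bits
+ * (__GFP_DMA / __GFP_HIGHMEM), so it selects the highest zone this
+ * list is allowed to hand out.
+ */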
+ k = ZONE_NORMAL;
+ if (i & __GFP_HIGHMEM)
+ k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA)
+ k = ZONE_DMA;
+
+ j = build_zonelists_node(pgdat, zonelist, j, k);
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (wrapping around to node 0 after the last node)
+ */
+ for (node = local_node + 1; node < numnodes; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ for (node = 0; node < local_node; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+
+ zonelist->zones[j] = NULL;
+ }
+}
+
+#endif /* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+ int i;
+
+ for (i = 0; i < numnodes; i++)
+ build_zonelists(NODE_DATA(i));
+ printk("Built %i zonelists\n", numnodes);
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
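+ *
+ * For example, a 1GB zone of 4kB pages spans 262144 pages, and
+ * 262144/256 = 1024 is already a power of two, so wait_table_size()
+ * would return 1024 (the result is clamped to the range [4, 4096]).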
+ */
+#define PAGES_PER_WAITQUEUE 256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+ unsigned long size = 1;
+
+ pages /= PAGES_PER_WAITQUEUE;
+
+ while (size < pages)
+ size <<= 1;
+
+ /*
+ * Once we have dozens or even hundreds of threads sleeping
+ * on IO we've got bigger problems than wait queue collision.
+ * Limit the size of the wait table to a reasonable size.
+ */
+ size = min(size, 4096UL);
+
+ return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
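+ *
+ * For a power-of-two table size this is just log2(size): e.g. a table
+ * of 1024 entries gives ffz(~1024) == 10.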
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+ return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long realtotalpages, totalpages = 0;
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ totalpages += zones_size[i];
+ pgdat->node_spanned_pages = totalpages;
+
+ realtotalpages = totalpages;
+ if (zholes_size)
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ realtotalpages -= zholes_size[i];
+ pgdat->node_present_pages = realtotalpages;
+ printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ struct page *page;
+
+ for (page = start; page < (start + size); page++) {
+ set_page_zone(page, NODEZONE(nid, zone));
+ set_page_count(page, 0);
+ SetPageReserved(page);
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (zone != ZONE_HIGHMEM)
+ set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+#endif
+ start_pfn++;
+ }
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(start, size, nid, zone, start_pfn) \
+ memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long i, j;
+ const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+ int cpu, nid = pgdat->node_id;
+ struct page *lmem_map = pgdat->node_mem_map;
+ unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+ pgdat->nr_zones = 0;
+ init_waitqueue_head(&pgdat->kswapd_wait);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ unsigned long size, realsize;
+ unsigned long batch;
+
+ zone_table[NODEZONE(nid, j)] = zone;
+ realsize = size = zones_size[j];
+ if (zholes_size)
+ realsize -= zholes_size[j];
+
+ zone->spanned_pages = size;
+ zone->present_pages = realsize;
+ zone->name = zone_names[j];
+ spin_lock_init(&zone->lock);
+ spin_lock_init(&zone->lru_lock);
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+
+ zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+ /*
+ * The per-cpu-pages pools are set to around 1/1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
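+ *
+ * Worked example: a 512MB zone of 4kB pages has 131072 pages, so
+ * 131072/1024 = 128 pages (512kB) is capped to 64 pages (256kB) and
+ * then quartered, giving batch = 16 (hot pool: low 32, high 96).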
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone->pageset[cpu].pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ printk(" %s zone: %lu pages, LIFO batch:%lu\n",
+ zone_names[j], realsize, batch);
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_list);
+ atomic_set(&zone->nr_scan_active, 0);
+ atomic_set(&zone->nr_scan_inactive, 0);
+ zone->nr_active = 0;
+ zone->nr_inactive = 0;
+ if (!size)
+ continue;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(size);
+ zone->wait_table_bits =
+ wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, zone->wait_table_size
+ * sizeof(wait_queue_head_t));
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+
+ pgdat->nr_zones = j+1;
+
+ zone->zone_mem_map = lmem_map;
+ zone->zone_start_pfn = zone_start_pfn;
+
+ if ((zone_start_pfn) & (zone_required_alignment-1))
+ printk("BUG: wrong zone alignment, it will crash\n");
+
+ memmap_init(lmem_map, size, nid, j, zone_start_pfn);
+
+ zone_start_pfn += size;
+ lmem_map += size;
+
+ for (i = 0; ; i++) {
+ unsigned long bitmap_size;
+
+ INIT_LIST_HEAD(&zone->free_area[i].free_list);
+ if (i == MAX_ORDER-1) {
+ zone->free_area[i].map = NULL;
+ break;
+ }
+
+ /*
+ * Page buddy system uses "index >> (i+1)",
+ * where "index" is at most "size-1".
+ *
+ * The extra "+3" is to round down to byte
+ * size (8 bits per byte assumption). Thus
+ * we get "(size-1) >> (i+4)" as the last byte
+ * we can access.
+ *
+ * The "+1" is because we want to round the
+ * byte allocation up rather than down. So
+ * we should have had a "+7" before we shifted
+ * down by three. Also, we have to add one as
+ * we actually _use_ the last bit (it's [0,n]
+ * inclusive, not [0,n[).
+ *
+ * So we actually had +7+1 before we shift
+ * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+ * (modulo overflows, which we do not have).
+ *
+ * Finally, we LONG_ALIGN because all bitmap
+ * operations are on longs.
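+ *
+ * Worked example: size = 4096 pages at order i = 0 gives
+ * (4095 >> 4) = 255, +1 = 256, LONG_ALIGN(256) = 256 bytes,
+ * i.e. one bit for each of the 2048 order-0 buddy pairs.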
+ */
+ bitmap_size = (size-1) >> (i+4);
+ bitmap_size = LONG_ALIGN(bitmap_size+1);
+ zone->free_area[i].map =
+ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+ }
+ }
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+ struct page *node_mem_map, unsigned long *zones_size,
+ unsigned long node_start_pfn, unsigned long *zholes_size)
+{
+ unsigned long size;
+
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = node_start_pfn;
+ calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+ if (!node_mem_map) {
+ size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+ node_mem_map = alloc_bootmem_node(pgdat, size);
+ }
+ pgdat->node_mem_map = node_mem_map;
+
+ free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+static bootmem_data_t contig_bootmem_data;
+struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
+
+EXPORT_SYMBOL(contig_page_data);
+
+void __init free_area_init(unsigned long *zones_size)
+{
+ free_area_init_node(0, &contig_page_data, NULL, zones_size,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+ mem_map = contig_page_data.node_mem_map;
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+ pg_data_t *pgdat;
+ loff_t node = *pos;
+
+ for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+ --node;
+
+ return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ (*pos)++;
+ return pgdat->pgdat_next;
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/*
+ * This walks the freelist for each zone. Whilst this is slow, I'd rather
+ * be slow here than slow down the fast path by keeping stats - mjbligh
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+ int order;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ unsigned long nr_bufs = 0;
+ struct list_head *elem;
+
+ list_for_each(elem, &(zone->free_area[order].free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+ return 0;
+}
+
+struct seq_operations fragmentation_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = frag_show,
+};
+
+static char *vmstat_text[] = {
+ "nr_dirty",
+ "nr_writeback",
+ "nr_unstable",
+ "nr_page_table_pages",
+ "nr_mapped",
+ "nr_slab",
+
+ "pgpgin",
+ "pgpgout",
+ "pswpin",
+ "pswpout",
+ "pgalloc_high",
+
+ "pgalloc_normal",
+ "pgalloc_dma",
+ "pgfree",
+ "pgactivate",
+ "pgdeactivate",
+
+ "pgfault",
+ "pgmajfault",
+ "pgrefill_high",
+ "pgrefill_normal",
+ "pgrefill_dma",
+
+ "pgsteal_high",
+ "pgsteal_normal",
+ "pgsteal_dma",
+ "pgscan_kswapd_high",
+ "pgscan_kswapd_normal",
+
+ "pgscan_kswapd_dma",
+ "pgscan_direct_high",
+ "pgscan_direct_normal",
+ "pgscan_direct_dma",
+ "pginodesteal",
+
+ "slabs_scanned",
+ "kswapd_steal",
+ "kswapd_inodesteal",
+ "pageoutrun",
+ "allocstall",
+
+ "pgrotated",
+};
+
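+/*
+ * The seq_file handlers below treat struct page_state as a flat array of
+ * unsigned longs laid out in the same order as vmstat_text[], so each
+ * /proc/vmstat line is simply a name plus the counter at that offset.
+ */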
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+ struct page_state *ps;
+
+ if (*pos >= ARRAY_SIZE(vmstat_text))
+ return NULL;
+
+ ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+ m->private = ps;
+ if (!ps)
+ return ERR_PTR(-ENOMEM);
+ get_full_page_state(ps);
+ ps->pgpgin /= 2; /* sectors -> kbytes */
+ ps->pgpgout /= 2;
+ return (unsigned long *)ps + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos >= ARRAY_SIZE(vmstat_text))
+ return NULL;
+ return (unsigned long *)m->private + *pos;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+ unsigned long *l = arg;
+ unsigned long off = l - (unsigned long *)m->private;
+
+ seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+ return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+ kfree(m->private);
+ m->private = NULL;
+}
+
+struct seq_operations vmstat_op = {
+ .start = vmstat_start,
+ .next = vmstat_next,
+ .stop = vmstat_stop,
+ .show = vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ long *count;
+
+ if (action == CPU_DEAD) {
+ /* Drain local pagecache count. */
+ count = &per_cpu(nr_pagecache_local, cpu);
+ atomic_add(*count, &nr_pagecache);
+ *count = 0;
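+ /* Return the dead CPU's hot/cold per-cpu pages to the buddy free lists. */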
+ local_irq_disable();
+ __drain_pages(cpu);
+ local_irq_enable();
+ }
+ return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void __init page_alloc_init(void)
+{
+ hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+static unsigned long higherzone_val(struct zone *z, int max_zone,
+ int alloc_type)
+{
+ int z_idx = zone_idx(z);
+ struct zone *higherzone;
+ unsigned long pages;
+
+ /* there is no higher zone to get a contribution from */
+ if (z_idx == MAX_NR_ZONES-1)
+ return 0;
+
+ higherzone = &z->zone_pgdat->node_zones[z_idx+1];
+
+ /* We always start with the higher zone's protection value */
+ pages = higherzone->protection[alloc_type];
+
+ /*
+ * We get a lower-zone-protection contribution only if there are
+ * pages in the higher zone and if we're not the highest zone
+ * in the current zonelist. e.g., never happens for GFP_DMA. Happens
+ * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
+ * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
+ */
+ if (higherzone->present_pages && z_idx < alloc_type)
+ pages += higherzone->pages_low * sysctl_lower_zone_protection;
+
+ return pages;
+}
+
+/*
+ * setup_per_zone_protection - called whenever min_free_kbytes or
+ * sysctl_lower_zone_protection changes. Ensures that each zone
+ * has a correct pages_protected value, so an adequate number of
+ * pages are left in the zone after a successful __alloc_pages().
+ *
+ * This algorithm is way confusing. It tries to keep the same behavior
+ * as we had with the incremental min iterative algorithm.
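+ *
+ * Example (all three zones present, p = sysctl_lower_zone_protection):
+ * for a GFP_HIGHMEM allocation, ZONE_HIGHMEM ends up protecting
+ * pages_low(HIGH), ZONE_NORMAL protects
+ * pages_low(NORMAL) + (1+p)*pages_low(HIGH), and ZONE_DMA protects
+ * pages_low(DMA) + (1+p)*(pages_low(NORMAL) + pages_low(HIGH)).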
+ */
+static void setup_per_zone_protection(void)
+{
+ struct pglist_data *pgdat;
+ struct zone *zones, *zone;
+ int max_zone;
+ int i, j;
+
+ for_each_pgdat(pgdat) {
+ zones = pgdat->node_zones;
+
+ for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
+ if (zones[i].present_pages)
+ max_zone = i;
+
+ /*
+ * For each of the different allocation types:
+ * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+ */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ /*
+ * For each of the zones:
+ * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+ */
+ for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+ zone = &zones[j];
+
+ /*
+ * We never protect zones that don't have memory
+ * in them (j>max_zone) or zones that aren't in
+ * the zonelists for a certain type of
+ * allocation (j>i). We have to assign these to
+ * zero because the lower zones take
+ * contributions from the higher zones.
+ */
+ if (j > max_zone || j > i) {
+ zone->protection[i] = 0;
+ continue;
+ }
+ /*
+ * The contribution of the next higher zone
+ */
+ zone->protection[i] = higherzone_val(zone,
+ max_zone, i);
+ zone->protection[i] += zone->pages_low;
+ }
+ }
+ }
+}
+
+/*
+ * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
+ * that the pages_{min,low,high} values for each zone are set correctly
+ * with respect to min_free_kbytes.
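+ *
+ * E.g. min_free_kbytes = 1024 with 4kB pages gives pages_min = 256,
+ * spread across the lowmem zones in proportion to their size; each
+ * zone then gets pages_low = 2*pages_min and pages_high = 3*pages_min.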
+ */
+static void setup_per_zone_pages_min(void)
+{
+ unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long lowmem_pages = 0;
+ struct zone *zone;
+ unsigned long flags;
+
+ /* Calculate total number of !ZONE_HIGHMEM pages */
+ for_each_zone(zone) {
+ if (!is_highmem(zone))
+ lowmem_pages += zone->present_pages;
+ }
+
+ for_each_zone(zone) {
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ if (is_highmem(zone)) {
+ /*
+ * Often, highmem doesn't need to reserve any pages.
+ * But the pages_min/low/high values are also used for
+ * batching up page reclaim activity so we need a
+ * decent value here.
+ */
+ int min_pages;
+
+ min_pages = zone->present_pages / 1024;
+ if (min_pages < SWAP_CLUSTER_MAX)
+ min_pages = SWAP_CLUSTER_MAX;
+ if (min_pages > 128)
+ min_pages = 128;
+ zone->pages_min = min_pages;
+ } else {
+ /* if it's a lowmem zone, reserve a number of pages
+ * proportionate to the zone's size.
+ */
+ zone->pages_min = (pages_min * zone->present_pages) /
+ lowmem_pages;
+ }
+
+ zone->pages_low = zone->pages_min * 2;
+ zone->pages_high = zone->pages_min * 3;
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ }
+}
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min). For large machines
+ * we want it large (16MB max). But it is not linear, because network
+ * bandwidth does not increase linearly with machine size. We use
+ *
+ * min_free_kbytes = sqrt(lowmem_kbytes)
+ *
+ * which yields
+ *
+ * 16MB: 128k
+ * 32MB: 181k
+ * 64MB: 256k
+ * 128MB: 362k
+ * 256MB: 512k
+ * 512MB: 724k
+ * 1024MB: 1024k
+ * 2048MB: 1448k
+ * 4096MB: 2048k
+ * 8192MB: 2896k
+ * 16384MB: 4096k
+ */
+static int __init init_per_zone_pages_min(void)
+{
+ unsigned long lowmem_kbytes;
+
+ lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+
+ min_free_kbytes = int_sqrt(lowmem_kbytes);
+ if (min_free_kbytes < 128)
+ min_free_kbytes = 128;
+ if (min_free_kbytes > 16384)
+ min_free_kbytes = 16384;
+ setup_per_zone_pages_min();
+ setup_per_zone_protection();
+ return 0;
+}
+module_init(init_per_zone_pages_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ * that we can call two helper functions whenever min_free_kbytes
+ * changes.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length)
+{
+ proc_dointvec(table, write, file, buffer, length);
+ setup_per_zone_pages_min();
+ setup_per_zone_protection();
+ return 0;
+}
+
+/*
+ * lower_zone_protection_sysctl_handler - just a wrapper around
+ * proc_dointvec() so that we can call setup_per_zone_protection()
+ * whenever sysctl_lower_zone_protection changes.
+ */
+int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length)
+{
+ proc_dointvec_minmax(table, write, file, buffer, length);
+ setup_per_zone_protection();
+ return 0;
+}